mirror of https://github.com/InternLM/InternLM
Merge branch 'main' into develop
commit 0e62d41137

@@ -39,7 +39,7 @@ jobs:
    needs: check-requirements
    runs-on: [lmtest]
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
@@ -60,15 +60,29 @@ jobs:
    runs-on: [lmtest]
    timeout-minutes: 30
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3

    - name: slurm-train
+      id: basic_train
      run: |
        source activate internlm-env-test
        sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

+    - name: load_preset_ckpt
+      if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+    - name: load_new_ckpt
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
        rm -rf $GITHUB_WORKSPACE/llm_ckpts

    - name: torchrun-train
@@ -91,18 +105,17 @@ jobs:
      run: |
        source activate internlm-env-test
        export PYTHONPATH=$PWD:$PYTHONPATH
        sh ./ci_scripts/model/convert_to_hf.sh
        cd ./hf_ckpt
        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
        cd ..
        rm -rf $GITHUB_WORKSPACE/hf_ckpt

  load-chat-model-in-hf:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [lmtest]
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
@@ -1,6 +1,6 @@
name: lint-check

on:
  push:
  pull_request:
    branches:
@@ -1,7 +1,7 @@
name: Sonarqube
on:
  workflow_dispatch:

jobs:
  sonarqube:
    name: SonarQube Scan
@@ -13,4 +13,4 @@ jobs:
    - uses: sonarsource/sonarqube-scan-action@master
      env:
        SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
        SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
@@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: doc/code-docs/source/conf.py
  fail_on_warning: false

# Optionally build your docs in additional formats such as PDF
formats:
  - pdf

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: doc/code-docs/requirements.txt
@@ -40,6 +40,10 @@ InternLM は、70 億のパラメータを持つベースモデルと、実用

In addition, a lightweight training framework is provided to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs and fine-tuning on a single GPU, while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency when training on 1024 GPUs.

+## News
+
+InternLM-7B-Chat v1.1 has been released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -80,8 +84,8 @@ Transformers を使用して InternLM 7B チャットモデルをロードする

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "こんにちは", history=[])
>>> print(response)
```
@@ -45,6 +45,10 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

A lightweight training framework for model pre-training is provided, with no need to install a large set of dependencies. A single codebase supports pre-training on thousand-GPU clusters and single-GPU human-preference alignment training, while delivering extreme performance optimization with nearly 90% acceleration efficiency for training on a thousand GPUs.

+## News
+
+We have open-sourced InternLM-Chat-7B v1.1. The model can call a code interpreter and tool plugins. You can try these new features in [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -74,6 +78,7 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
+| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |

@@ -85,8 +90,8 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "你好", history=[])
>>> print(response)
```
@@ -117,26 +122,44 @@ streamlit run web_demo.py

We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.

-1. First, install LMDeploy:
-
-```bash
-python3 -m pip install lmdeploy
-```
-
-2. Use the following command for quick deployment:
-
-```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model
-```
-
-3. After exporting the model, you can start a server and chat with the AI from a client using the following commands:
-
-```bash
-bash workspace/service_docker_up.sh
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-```
-
-[LMDeploy](https://github.com/InternLM/LMDeploy) supports the complete InternLM deployment workflow. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
+```bash
+python3 -m pip install lmdeploy
+```
+
+Run the following commands to chat with the `internlm-chat-7b` model interactively in the terminal, or chat with it through a WebUI.
+
+```bash
+# convert the weight layout
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
+
+# interactive chat in the terminal
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# start the gradio server
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+In the steps above, LMDeploy uses FP16 computation precision.
+
+Besides FP16, LMDeploy also supports 4-bit weight-only inference for `internlm-chat-7b`. It not only reduces the model's memory footprint to 6 GB, roughly 40% of FP16, but more importantly, with extreme kernel-level optimization, its inference performance on an A100-80G reaches more than 2.4x that of FP16.
+
+The following shows how to deploy the 4-bit `internlm-chat-7b` model. For the inference speed benchmark, please refer to [here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md#%E6%8E%A8%E7%90%86%E9%80%9F%E5%BA%A6).
+
+```bash
+# download the pre-quantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+LMDeploy is a full toolbox for lightweighting, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.

## Fine-tuning & Training
README.md (58 changed lines)
@@ -45,6 +45,10 @@ InternLM has open-sourced a 7 billion parameter base model and a chat model tail

Additionally, a lightweight training framework is offered to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs, and fine-tuning on a single GPU while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency during training on 1024 GPUs.

+## News
+
+InternLM-7B-Chat v1.1 is released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -74,6 +78,7 @@ InternLM 7B and InternLM 7B Chat, trained using InternLM, have been open-sourced

| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------- |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
+| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |

@@ -85,8 +90,8 @@ To load the InternLM 7B Chat model using Transformers, use the following code:

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "hello", history=[])
>>> print(response)
```
@@ -118,28 +123,45 @@ The effect is as follows

### Deployment

-We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.
+We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the workflow of InternLM deployment.

-1. First, install LMDeploy:
-
-```bash
-python3 -m pip install lmdeploy
-```
-
-2. Use the following command for quick deployment:
-
-```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model
-```
-
-3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command:
-
-```bash
-bash workspace/service_docker_up.sh
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-```
-
-[LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
+```bash
+python3 -m pip install lmdeploy
+```
+
+You can utilize the following commands to conduct `internlm-chat-7b` FP16 inference, serve it, and interact with the AI assistant via a WebUI:
+
+```bash
+# convert weight layout
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+You can also deploy the 4-bit quantized `internlm-chat-7b` model via LMDeploy. It greatly trims down the model's memory overhead to 6 GB, just 40% of what FP16 inference would take. More importantly, with extremely optimized kernels, the inference performance achieves more than 2.4x that of FP16 inference on an A100-80G.
+
+Try the following to enjoy 4-bit `internlm-chat-7b` on a GeForce RTX 30-series GPU card. You can find the inference benchmark [here](https://github.com/InternLM/lmdeploy/blob/main/docs/en/w4a16.md#inference-performance).
+
+```bash
+# download the pre-quantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+LMDeploy is an efficient toolkit for compressing, deploying, and serving LLM models. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.

## Fine-tuning & Training
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-


def merge_dicts(dict_a: dict, dict_b: dict):
    for key in dict_b.keys():
        if isinstance(dict_b[key], dict):
            dict_b[key] = {**dict_a[key], **dict_b[key]}
            merge_dicts(dict_a[key], dict_b[key])
    dict_c = {**dict_a, **dict_b}
    return dict_c


def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
    result = ""
    for key, value in data.items():
        if isinstance(value, dict):
            result += f"{' ' * indent}{key} = dict(\n"
            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
            result += f"{' ' * indent})"
        else:
            result += f"{' ' * indent}{key} = {repr(value)}"
        if is_nested:
            result += ","
        result += "\n"
    result = f"""\
{result}
"""
    return result
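
For reference, a minimal sketch of how these two helpers are meant to be combined (the dictionaries below are made-up values, not taken from the repository; the import assumes the repository root is on `PYTHONPATH`):

```python
# Hypothetical example values; only the two helpers come from ci_scripts/common/com_func.py.
from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

# Nested dicts are merged key by key, with values from `override` taking precedence:
# {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}
merged = merge_dicts(base, override)

# Renders the merged dict as `key = value` / `key = dict(...)` assignments,
# ready to be written out as a training config .py file.
print(format_dict_to_py_string(merged))
```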
@@ -16,7 +16,7 @@ exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

if [[ -d ${RESULTS} ]]; then
    if ! rm -rf ${RESULTS}/*; then
@@ -12,7 +12,7 @@ exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test tokenizer.py."

num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then
@@ -40,7 +40,7 @@ num=$(num_files "${CKPTS_OUTPUT}")

if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# NOTICE: should not remove the cached files, because the cached files will be used in the next test case.
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@@ -27,7 +27,7 @@ ckpt = dict(
    load_optimizer=True,
)

-TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
+TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
@@ -120,8 +120,8 @@ zero1 parallel:
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
       For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel: pipeline parallel size.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
+pipeline parallel: pipeline parallel size, only 1 is accepted currently.
+tensor parallel: tensor parallel size, usually the number of GPUs per node, only 1 is accepted currently.
"""
parallel = dict(
    zero1=8,
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os

from ci_scripts.common import com_func
from internlm.core.context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
    # generate path of the new config py
    config_path = os.path.split(config_py_file)
    new_config_py_file = os.path.join(config_path[0], case_name + ".py")

    # merge dict
    origin_config = Config.from_file(config_py_file)
    with open(test_config_json) as f:
        test_config = json.load(f)
    if test_config:
        if case_name not in test_config.keys():
            raise KeyError(f"the {case_name} doesn't exist.Please check {test_config} again!")
        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
        print(f"new config is:\n{new_config}")

        # write new config to py file
        file_content = com_func.format_dict_to_py_string(new_config)
        with open(new_config_py_file, "w") as f:
            f.write(file_content)
        print(f"The new test train config file is {new_config_py_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--origin_config",
        type=str,
        default="./ci_scripts/train/ci_7B_sft.py",
        help="path to the origin train config file",
    )
    parser.add_argument(
        "--test_config",
        type=str,
        default="./ci_scripts/train/test_config.json",
        help="path to the test train config file",
    )
    parser.add_argument("--case_name", type=str, help="name of the case which will be runned ")
    args = parser.parse_args()
    generate_new_config(args.origin_config, args.test_config, args.case_name)
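
The CI driver script shown further below (load_ckpt.sh) invokes this through the command line as `python ./ci_scripts/train/generate_config.py --case_name $1`. A rough Python-level equivalent, assuming the repository root is the working directory and is on `PYTHONPATH`, would be:

```python
# Hypothetical driver; mirrors what ci_scripts/train/load_ckpt.sh does via the CLI.
from ci_scripts.train.generate_config import generate_new_config

# Overlays the "7B_load_new_ckpt" entry of test_config.json onto ci_7B_sft.py and
# writes the merged result to ./ci_scripts/train/7B_load_new_ckpt.py.
generate_new_config(
    config_py_file="./ci_scripts/train/ci_7B_sft.py",
    test_config_json="./ci_scripts/train/test_config.json",
    case_name="7B_load_new_ckpt",
)
```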
@@ -0,0 +1,38 @@
#!/bin/bash
set -x

[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test slurm training with loading checkpoint."

python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$(($exit_code + 1))
fi

srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }


num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
@@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh
@@ -25,12 +25,6 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --
num=$(num_files "${CKPTS20_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
-    exit_code=$(($exit_code + 1))
-fi
-
-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi
@@ -0,0 +1,45 @@
{
    "7B_basic_train": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "ckpt": {
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 20
        }
    },
    "7B_load_new_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    },
    "7B_load_preset_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    }
}
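
One detail worth noting in this file: `7B_basic_train` checkpoints every 20 steps into `llm_ckpts/20`, and `7B_load_new_ckpt` resumes from that same folder, so the cases are meant to run in order. A small, hypothetical sanity check of that chaining (standard library only, not part of the CI itself) could look like:

```python
# Hypothetical sanity check for ci_scripts/train/test_config.json.
import json

with open("./ci_scripts/train/test_config.json") as f:
    cases = json.load(f)

# Every case must at least carry the ckpt/data overrides that generate_config.py merges in.
for name, overrides in cases.items():
    assert "ckpt" in overrides and "data" in overrides, f"{name} is missing ckpt/data overrides"

# 7B_load_new_ckpt resumes from the folder that 7B_basic_train writes (llm_ckpts/<checkpoint_every>).
every = cases["7B_basic_train"]["ckpt"]["checkpoint_every"]
resume = cases["7B_load_new_ckpt"]["ckpt"]["load_ckpt_folder"]
assert resume.endswith(f"llm_ckpts/{every}"), "load_new_ckpt should resume from basic_train's checkpoint"
print("test_config.json cases are consistent")
```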
@@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh
@@ -25,7 +25,7 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --
num=$(num_files "${CKPTS_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
@@ -75,7 +75,8 @@ grad_scaler = dict(

hybrid_zero_optimizer = dict(
    # Enable low_level_optimzer overlap_communication
-    zero_overlap_communication=True,
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
@@ -120,12 +121,11 @@ model = dict(
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",
+    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    sequence_parallel=False,
)
"""
zero1 parallel:
@@ -142,6 +142,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
)

cudnn_deterministic = False
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -0,0 +1,10 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
enum_tools
numpy
torch
tqdm
@@ -0,0 +1,2 @@
Model Checkpointing
===================
@@ -0,0 +1,91 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

import os
import sys

project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "recommonmark",
    "sphinx_rtd_theme",
    "sphinx.ext.viewcode",
    "sphinx.ext.autodoc",
    "sphinxcontrib.autodoc_pydantic",
    "sphinx.ext.autosectionlabel",
    "sphinx.ext.napoleon",
]

pygments_style = "sphinx"

# autodoc_pyandtic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = " "
autodoc_pydantic_model_signature_prefix = "class"
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = "bysource"
autodoc_pydantic_model_member_order = "bysource"
autodoc_pydantic_field_list_validators = False

# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = False
napoleon_use_param = True
napoleon_use_rtype = True
napoleon_preprocess_types = False
napoleon_type_aliases = None
napoleon_attr_annotations = True

templates_path = ["_templates"]

exclude_patterns = []

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]

# GitHub integration
html_context = {
    "display_github": True,
    "github_user": "pjlab",
    "github_repo": "InternLM",
    "github_version": "master",
    "conf_py_path": "/doc/code-docs/source/",
}

sys.path.insert(0, os.path.abspath("../../../"))

# Prepend module names to class descriptions
add_module_names = True

autoclass_content = "class"

autodoc_mock_imports = [
    "apex",
    "torch",
    "numpy",
]
@@ -0,0 +1,70 @@
.. InternLM documentation master file, created by
   sphinx-quickstart on Mon Aug 28 17:33:28 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

InternLM
========

Environment Setup
-------------------

.. toctree::
   :maxdepth: 2

   install

Model Setup
-------------------

.. toctree::
   :maxdepth: 2

   initialize

Training API
-------------------

.. toctree::
   :maxdepth: 2

   training

Parallel Training
-------------------

.. toctree::
   :maxdepth: 2

   parallel

Model Checkpointing
-------------------

.. toctree::
   :maxdepth: 2

   checkpoint

Profiler
-------------------

.. toctree::
   :maxdepth: 2

   profiler

Monitor
-------------------

.. toctree::
   :maxdepth: 2

   monitor

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -0,0 +1,35 @@
Training Setup
==============

.. _InternLM-args:

Argument Parsing
----------------
InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply commandline
configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default
parser with some builtin arguments; users can add custom parameters to this parser.

.. code-block:: python

    # Get InternLM default parser
    parser = internlm.initialize.get_default_parser()
    # Add new argument
    parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.")
    cmd_args = parser.parse_args()

.. autofunction:: internlm.initialize.get_default_parser


.. _InternLM-init:

Model Initialization
-------------------------

Optimizer Initialization
-------------------------

Dataloader Initialization
-------------------------

Trainer Initialization
-------------------------
@@ -0,0 +1,70 @@
## Installation

### Environment Preparation
The required packages and corresponding versions are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS

After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```

### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```

It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```

Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```

Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```

### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:

```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```
@@ -0,0 +1,10 @@
Monitor and Alert
=================


Monitoring
-----------------


Alerting
-----------------
@@ -0,0 +1,23 @@
Parallel Training
=================

.. Describe the overall usage of the parallel configuration here, then detail each module below

Tensor Parallel
-----------------


Pipeline Parallel
-----------------


Sequence Parallel
-----------------


Data Parallel
-----------------


ZeRO1.5
-----------------
@@ -0,0 +1,11 @@
Profiler
========

.. Introduce the usage of the torch profiler and memory profiler here

Torch Profiler
-----------------


Memory Profiler
-----------------
@@ -0,0 +1,2 @@
Training API
============
@@ -59,12 +59,28 @@ cd ../../
```

### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the Dockerfile are implemented through docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+```
+In docker.Makefile, you can customize the base image, environment versions, etc., and the corresponding parameters can be passed directly through the command line. For BASE_OS, ubuntu20.04 and centos7 are supported.
+
+#### Pull Standard Image
+The standard images based on ubuntu and centos have been built and can be pulled directly:

```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
+
+#### Run Container
+For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory in the container is `/InternLM`; please start training according to the [Usage](./usage.md).
@@ -6,11 +6,14 @@ The system code file structure is shown below:
├── internlm                                # Main directory of the system code
│   ├── apis                                # Interface module, containing some interface functions related to inference, etc.
│   ├── core                                # Core module, managing parallel context and training scheduling engine for training and inference
+│   │   ├── communication                   # Communication module, responsible for p2p communication in pipeline parallel scheduling
│   │   ├── context                         # Context module, mainly responsible for initializing parallel process groups and managing parallel context
│   │   │   ├── parallel_context.py
│   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                       # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
│   │   ├── engine.py                       # Responsible for managing the training and evaluation process of the model
-│   │   ├── no_pipeline_scheduler.py        # Scheduler for parallel training
│   │   └── trainer.py                      # Responsible for managing the training engine and scheduler
│   ├── data                                # Data module, responsible for managing dataset generation and processing
│   ├── initialize                          # Initialization module, responsible for managing distributed environment startup and trainer initialization
@@ -165,8 +165,9 @@ Training parallel configuration example:

```python
parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
)
```

@@ -174,8 +175,11 @@ parallel = dict(
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
-- pipeline: pipeline parallel size, default value is 1
-- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+- pipeline: pipeline parallel strategy
+  - size: pipeline parallel size, the default value is 1
+  - interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
+- sequence_parallel: whether to enable sequence parallelism, the default value is False

Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
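
To make the sizing rule in the note above concrete, here is a small, self-contained sketch (a hypothetical helper, not part of the InternLM codebase) that derives the data-parallel size and checks a zero1 setting against it:

```python
# Hypothetical helper illustrating the sizing rule from the note above;
# it is not part of the InternLM codebase.
def derive_parallel_sizes(total_gpus: int, tensor: int = 1, pipeline: int = 1, zero1: int = -1):
    assert total_gpus % (tensor * pipeline) == 0, "tensor * pipeline must divide the GPU count"
    # Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size
    dp = total_gpus // (tensor * pipeline)
    # zero1 <= 0 means the zero1 group spans the whole data-parallel group
    zero1_group = dp if zero1 <= 0 else zero1
    assert zero1_group <= dp and dp % zero1_group == 0, "zero1 must evenly divide the data-parallel size"
    return {"data_parallel": dp, "zero1_group": zero1_group}

# Example: 2 nodes x 8 GPUs, tensor=1, pipeline=1, zero1=8 (as in the config above)
print(derive_parallel_sizes(total_gpus=16, tensor=1, pipeline=1, zero1=8))
# {'data_parallel': 16, 'zero1_group': 8}
```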
@@ -59,11 +59,28 @@ cd ../../
```

### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the dockerfile are implemented through docker.Makefile. To build an image, execute the following command in the InternLM root directory:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+```
+In docker.Makefile you can customize the base image, environment versions, and so on, and the corresponding parameters can be passed directly on the command line. For BASE_OS, ubuntu20.04 and centos7 are supported.
+
+#### Pull Standard Image
+The standard images based on ubuntu and centos have already been built and can be pulled directly:

```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
+
+#### Run Container
+For a local standard image built with the dockerfile or pulled, use the following command to run it and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory inside the container is `/InternLM`; please start training according to the [usage documentation](./usage.md).
@ -6,11 +6,14 @@
├── internlm # Main directory of the system code
│ ├── apis # API module, containing interface functions related to inference and others
│ ├── core # Core module, managing the parallel context and the training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing the parallel process groups and managing the parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, managing the schedulers for parallel training, including the non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ └── trainer.py # Responsible for managing the training engine and schedulers
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization
@ -151,16 +151,20 @@ model = dict(

```python
parallel = dict(
    zero1=8,
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)
```

- zero1: zero parallel strategy, with the following three cases, default value is -1
  - When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
  - When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
  - When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- pipeline: pipeline parallel strategy
  - size: pipeline parallel size, the default value is 1
  - interleaved_overlap: bool type, enables or disables communication optimization when using interleaved scheduling; the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False

Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
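As a companion to the note above, here is a small illustrative sketch of how one data parallel group is partitioned into zero1 groups. The helper `zero1_groups` and the 32-GPU figures are assumptions for illustration only, not code from this repository.

```python
# Illustrative sketch only: partition the ranks of one data parallel group
# into zero1 groups, as described above (zero1 size must divide the data parallel size).

def zero1_groups(data_parallel_ranks: list[int], zero1_size: int) -> list[list[int]]:
    assert len(data_parallel_ranks) % zero1_size == 0, "zero1 size must divide the data parallel size"
    return [
        data_parallel_ranks[i : i + zero1_size]
        for i in range(0, len(data_parallel_ranks), zero1_size)
    ]

# Example: 32 GPUs with tensor=1 and pipeline=1 give a data parallel size of 32;
# zero1=8 shards the optimizer states across groups of 8 ranks.
print(zero1_groups(list(range(32)), zero1_size=8))
# [[0, ..., 7], [8, ..., 15], [16, ..., 23], [24, ..., 31]]
```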
@ -0,0 +1,107 @@
DOCKER_REGISTRY ?= docker.io
DOCKER_ORG ?= my
DOCKER_IMAGE ?= internlm
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

CUDA_VERSION = 11.7.1
GCC_VERSION = 10.2.0

CUDNN_VERSION = 8
BASE_RUNTIME =
# ubuntu20.04 centos7
BASE_OS = centos7
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia
# The conda channel to use to install pytorch / torchvision
INSTALL_CHANNEL ?= pytorch

PYTHON_VERSION ?= 3.10
PYTORCH_VERSION ?= 1.13.1
TORCHVISION_VERSION ?= 0.14.1
TORCHAUDIO_VERSION ?= 0.13.1
BUILD_PROGRESS ?= auto
TRITON_VERSION ?=
GMP_VERSION ?= 6.2.1
MPFR_VERSION ?= 4.1.0
MPC_VERSION ?= 1.2.1
GCC_VERSION ?= 10.2.0
HTTPS_PROXY_I ?=
HTTP_PROXY_I ?=
FLASH_ATTEN_VERSION ?= 1.0.5
FLASH_ATTEN_TAG ?= v${FLASH_ATTEN_VERSION}

BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
	--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
	--build-arg CUDA_VERSION=$(CUDA_VERSION) \
	--build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
	--build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
	--build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION) \
	--build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION) \
	--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
	--build-arg TRITON_VERSION=$(TRITON_VERSION) \
	--build-arg GMP_VERSION=$(GMP_VERSION) \
	--build-arg MPFR_VERSION=$(MPFR_VERSION) \
	--build-arg MPC_VERSION=$(MPC_VERSION) \
	--build-arg GCC_VERSION=$(GCC_VERSION) \
	--build-arg https_proxy=$(HTTPS_PROXY_I) \
	--build-arg http_proxy=$(HTTP_PROXY_I) \
	--build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)

EXTRA_DOCKER_BUILD_FLAGS ?=

BUILD ?= build
# Intentionally left blank
PLATFORMS_FLAG ?=
PUSH_FLAG ?=
USE_BUILDX ?=1
BUILD_PLATFORMS ?=
WITH_PUSH ?= false
BUILD_TYPE ?= intrenlm-dev

# Setup buildx flags
ifneq ("$(USE_BUILDX)","")
BUILD = buildx build
ifneq ("$(BUILD_PLATFORMS)","")
PLATFORMS_FLAG = --platform="$(BUILD_PLATFORMS)"
endif
endif
# endif

# # Only set platforms flags if using buildx
# ifeq ("$(WITH_PUSH)","true")
# PUSH_FLAG = --push
# endif
# endif

ifeq ($(findstring centos,$(BASE_OS)),centos)
DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
else
DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
endif

# use -f to specify dockerfile
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
	docker $(BUILD) \
	--progress=$(BUILD_PROGRESS) \
	$(EXTRA_DOCKER_BUILD_FLAGS) \
	$(PLATFORMS_FLAG) \
	$(PUSH_FLAG) \
	-f $(DOCKERFILE_PATH) \
	-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
	$(BUILD_ARGS) .

# --target $(BUILD_TYPE)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
devel-image:
	$(DOCKER_BUILD)

.PHONY: clean
clean:
	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))
@ -0,0 +1,131 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on centos
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN yum install deltarpm -y && yum update -y \
|
||||||
|
&& yum install -y \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
bzip2 \
|
||||||
|
gcc \
|
||||||
|
gcc-c++ \
|
||||||
|
file \
|
||||||
|
texinfo \
|
||||||
|
which
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& git clone https://github.com/ninja-build/ninja.git \
|
||||||
|
&& cd ninja \
|
||||||
|
&& git checkout release \
|
||||||
|
&& ./configure.py --bootstrap \
|
||||||
|
&& mv ./ninja /usr/bin \
|
||||||
|
&& cd ..
|
||||||
|
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,112 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on ubuntu
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
ninja-build
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,161 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on centos
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN yum install deltarpm -y && yum update -y \
|
||||||
|
&& yum install -y \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
bzip2 \
|
||||||
|
gcc \
|
||||||
|
gcc-c++ \
|
||||||
|
file \
|
||||||
|
texinfo \
|
||||||
|
which
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& git clone https://github.com/ninja-build/ninja.git \
|
||||||
|
&& cd ninja \
|
||||||
|
&& git checkout release \
|
||||||
|
&& ./configure.py --bootstrap \
|
||||||
|
&& mv ./ninja /usr/bin \
|
||||||
|
&& cd ..
|
||||||
|
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG PYTORCH_VERSION
|
||||||
|
ARG TORCHVISION_VERSION
|
||||||
|
ARG TORCHAUDIO_VERSION
|
||||||
|
|
||||||
|
RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
transformers==4.29.2 \
|
||||||
|
sentencepiece \
|
||||||
|
numpy \
|
||||||
|
tqdm \
|
||||||
|
psutil \
|
||||||
|
packaging \
|
||||||
|
pre-commit \
|
||||||
|
ninja \
|
||||||
|
gputil \
|
||||||
|
pytest \
|
||||||
|
packaging \
|
||||||
|
boto3 \
|
||||||
|
botocore \
|
||||||
|
torch-scatter \
|
||||||
|
pyecharts \
|
||||||
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
torch==${PYTORCH_VERSION}+cu117 \
|
||||||
|
torchvision==${TORCHVISION_VERSION}+cu117 \
|
||||||
|
torchaudio==${TORCHAUDIO_VERSION}
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
ARG FLASH_ATTEN_TAG
|
||||||
|
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& git checkout ${FLASH_ATTEN_TAG} \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,142 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on ubuntu
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
ninja-build
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG PYTORCH_VERSION
|
||||||
|
ARG TORCHVISION_VERSION
|
||||||
|
ARG TORCHAUDIO_VERSION
|
||||||
|
|
||||||
|
RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
transformers==4.29.2 \
|
||||||
|
sentencepiece \
|
||||||
|
numpy \
|
||||||
|
tqdm \
|
||||||
|
psutil \
|
||||||
|
packaging \
|
||||||
|
pre-commit \
|
||||||
|
ninja \
|
||||||
|
gputil \
|
||||||
|
pytest \
|
||||||
|
packaging \
|
||||||
|
boto3 \
|
||||||
|
botocore \
|
||||||
|
torch-scatter \
|
||||||
|
pyecharts \
|
||||||
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
torch==${PYTORCH_VERSION}+cu117 \
|
||||||
|
torchvision==${TORCHVISION_VERSION}+cu117 \
|
||||||
|
torchaudio==${TORCHAUDIO_VERSION}
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
ARG FLASH_ATTEN_TAG
|
||||||
|
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& git checkout ${FLASH_ATTEN_TAG} \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,25 @@
## Experimental Environment Image

This module is used to test the new-version environment; by default the environment under test is torch=2.0.1, flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/install.md).

### Image Build and Pull

When building the image, please run docker.Makefile in the InternLM root directory. This Makefile is shared with the standard environment image, and the Dockerfile it uses is located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:

```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0

# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```

### Run Container

For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:

```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```

The default directory in the container is `/InternLM`; start training according to the [usage guide](../doc/usage.md).
@ -0,0 +1,25 @@
## Environment Image for Experiment

This module is used to test the new-version environment; by default the environment under test is torch=2.0.1, flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/en/install.md).

### Build and Pull Image

When building the image, please run make with docker.Makefile in the InternLM root directory. This Makefile is shared with the standard environment image, and the Dockerfile it uses is located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:

```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0

# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```

### Run Container

For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:

```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```

The default directory in the container is `/InternLM`; please start training according to the [Usage](../doc/en/usage.md) guide.
@ -7,6 +7,7 @@ from .parallel_context import (
|
||||||
from .process_group_initializer import (
|
from .process_group_initializer import (
|
||||||
Initializer_Data,
|
Initializer_Data,
|
||||||
Initializer_Model,
|
Initializer_Model,
|
||||||
|
Initializer_Nettest,
|
||||||
Initializer_Pipeline,
|
Initializer_Pipeline,
|
||||||
Initializer_Tensor,
|
Initializer_Tensor,
|
||||||
Initializer_Zero1,
|
Initializer_Zero1,
|
||||||
|
@ -34,6 +35,7 @@ __all__ = [
|
||||||
"Initializer_Pipeline",
|
"Initializer_Pipeline",
|
||||||
"Initializer_Data",
|
"Initializer_Data",
|
||||||
"Initializer_Zero1",
|
"Initializer_Zero1",
|
||||||
|
"Initializer_Nettest",
|
||||||
"ProcessGroupInitializer",
|
"ProcessGroupInitializer",
|
||||||
"Initializer_Model",
|
"Initializer_Model",
|
||||||
"seed",
|
"seed",
|
||||||
|
|
|
@ -143,6 +143,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
self.pipeline_parallel_size = 1
|
self.pipeline_parallel_size = 1
|
||||||
self.tensor_parallel_size = 1
|
self.tensor_parallel_size = 1
|
||||||
self.zero1_parallel_size = -1
|
self.zero1_parallel_size = -1
|
||||||
|
self.nettest_parallel_size = 1
|
||||||
self.num_processes_on_current_node = -1
|
self.num_processes_on_current_node = -1
|
||||||
self.virtual_pipeline_parallel_size = None
|
self.virtual_pipeline_parallel_size = None
|
||||||
self.virtual_pipeline_parallel_rank = None
|
self.virtual_pipeline_parallel_rank = None
|
||||||
|
@ -442,6 +443,9 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
# instead, it should be calculated based on other parallel config
|
# instead, it should be calculated based on other parallel config
|
||||||
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
|
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
|
||||||
|
|
||||||
|
# the recommended nettest_parallel_size is 32 GPUs
|
||||||
|
self.nettest_parallel_size = 32
|
||||||
|
|
||||||
if self.zero1_parallel_size <= 0:
|
if self.zero1_parallel_size <= 0:
|
||||||
self.zero1_parallel_size = self.data_parallel_size
|
self.zero1_parallel_size = self.data_parallel_size
|
||||||
|
|
||||||
|
@ -454,6 +458,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
self.pipeline_parallel_size,
|
self.pipeline_parallel_size,
|
||||||
self.tensor_parallel_size,
|
self.tensor_parallel_size,
|
||||||
self.zero1_parallel_size,
|
self.zero1_parallel_size,
|
||||||
|
self.nettest_parallel_size,
|
||||||
]
|
]
|
||||||
|
|
||||||
# run initialization of different process groups
|
# run initialization of different process groups
|
||||||
|
@ -462,6 +467,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
|
||||||
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
|
||||||
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
|
||||||
|
initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
|
||||||
if self.pipeline_parallel_size > 1:
|
if self.pipeline_parallel_size > 1:
|
||||||
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
|
||||||
for initializer in initializers:
|
for initializer in initializers:
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
|
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
|
||||||
|
|
||||||
|
import math
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
|
@ -31,6 +32,9 @@ class ParallelMode(Enum):
|
||||||
# zero1 parallel
|
# zero1 parallel
|
||||||
ZERO1 = "zero1"
|
ZERO1 = "zero1"
|
||||||
|
|
||||||
|
# runtime network test
|
||||||
|
NETTEST = "nettest"
|
||||||
|
|
||||||
|
|
||||||
class ProcessGroupInitializer(ABC):
|
class ProcessGroupInitializer(ABC):
|
||||||
"""An object, knowing the parallelism configuration, that initializes parallel groups.
|
"""An object, knowing the parallelism configuration, that initializes parallel groups.
|
||||||
|
@ -52,6 +56,7 @@ class ProcessGroupInitializer(ABC):
|
||||||
pipeline_parallel_size: int,
|
pipeline_parallel_size: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
zero1_parallel_size: int,
|
zero1_parallel_size: int,
|
||||||
|
nettest_parallel_size: int,
|
||||||
):
|
):
|
||||||
self.rank = rank
|
self.rank = rank
|
||||||
self.world_size = world_size
|
self.world_size = world_size
|
||||||
|
@ -59,6 +64,7 @@ class ProcessGroupInitializer(ABC):
|
||||||
self.pipeline_parallel_size = pipeline_parallel_size
|
self.pipeline_parallel_size = pipeline_parallel_size
|
||||||
self.tensor_parallel_size = tensor_parallel_size
|
self.tensor_parallel_size = tensor_parallel_size
|
||||||
self.zero1_parallel_size = zero1_parallel_size
|
self.zero1_parallel_size = zero1_parallel_size
|
||||||
|
self.nettest_parallel_size = nettest_parallel_size
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -332,3 +338,52 @@ class Initializer_Zero1(ProcessGroupInitializer):
|
||||||
ranks_in_group = ranks
|
ranks_in_group = ranks
|
||||||
|
|
||||||
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
||||||
|
|
||||||
|
|
||||||
|
class Initializer_Nettest(ProcessGroupInitializer):
|
||||||
|
"""A ProcessGroupInitializer for network test, especailly for NCCL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rank (int): The rank of current process.
|
||||||
|
world_size (int): Size of whole communication world.
|
||||||
|
nettest_parallel_size (int): Size of a network test group.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
|
||||||
|
|
||||||
|
def init_dist_group(self, use_cpu: bool = False):
|
||||||
|
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
|
||||||
|
A network test group's information tuple.
|
||||||
|
"""
|
||||||
|
local_rank = None
|
||||||
|
ranks_in_group = None
|
||||||
|
process_group = None
|
||||||
|
cpu_group = None
|
||||||
|
group_world_size = None
|
||||||
|
mode = ParallelMode.NETTEST
|
||||||
|
|
||||||
|
for i in range(self.num_nettest_group):
|
||||||
|
ranks = []
|
||||||
|
for j in range(self.nettest_parallel_size):
|
||||||
|
rank = i * self.nettest_parallel_size + j
|
||||||
|
if rank < self.world_size:
|
||||||
|
ranks.append(rank)
|
||||||
|
group = dist.new_group(ranks)
|
||||||
|
if use_cpu:
|
||||||
|
group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
|
||||||
|
else:
|
||||||
|
group_cpu = None
|
||||||
|
|
||||||
|
if self.rank in ranks:
|
||||||
|
local_rank = ranks.index(self.rank)
|
||||||
|
group_world_size = len(ranks)
|
||||||
|
process_group = group
|
||||||
|
cpu_group = group_cpu
|
||||||
|
ranks_in_group = ranks
|
||||||
|
|
||||||
|
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
||||||
|
|
|
@ -30,7 +30,7 @@ def get_tensor_shape():
|
||||||
|
|
||||||
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
|
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
|
||||||
if gpc.config.model.use_flash_attn:
|
if gpc.config.model.use_flash_attn:
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
|
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
|
||||||
tensor_shape = (
|
tensor_shape = (
|
||||||
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
|
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
|
||||||
|
@ -140,7 +140,7 @@ class PipelineScheduler(BaseScheduler):
|
||||||
and gpc.get_world_size(ParallelMode.TENSOR) > 1
|
and gpc.get_world_size(ParallelMode.TENSOR) > 1
|
||||||
)
|
)
|
||||||
|
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
self.scatter_gather_tensors = False
|
self.scatter_gather_tensors = False
|
||||||
|
|
||||||
# cache for the batch data
|
# cache for the batch data
|
||||||
|
|
|
@ -38,6 +38,11 @@ class TrainState:
|
||||||
# Total step count
|
# Total step count
|
||||||
self.total_steps: int = config.data.total_steps
|
self.total_steps: int = config.data.total_steps
|
||||||
|
|
||||||
|
# resume tensorboard folder, need load from checkpoint or set manually.
|
||||||
|
self.resume_tb_folder = config.resume_tb_folder
|
||||||
|
|
||||||
|
self.tensorboard_folder = config.tensorboard_folder
|
||||||
|
|
||||||
def init_batch_sampler(self, train_dl):
|
def init_batch_sampler(self, train_dl):
|
||||||
# Copy of the batch sampler from the DataLoader
|
# Copy of the batch sampler from the DataLoader
|
||||||
self.batch_sampler = train_dl.batch_sampler.copy()
|
self.batch_sampler = train_dl.batch_sampler.copy()
|
||||||
|
@ -73,8 +78,12 @@ class TrainState:
|
||||||
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
|
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
|
||||||
|
|
||||||
# track the actual updates of sampler when using weighted sampling
|
# track the actual updates of sampler when using weighted sampling
|
||||||
self.batch_sampler = train_dl.batch_sampler.copy()
|
if hasattr(self, "batch_sampler"):
|
||||||
self.batch_sampler_iter = iter(self.batch_sampler)
|
self.batch_sampler = train_dl.batch_sampler.copy()
|
||||||
|
self.batch_sampler_iter = iter(self.batch_sampler)
|
||||||
|
|
||||||
|
# resume tensorboard from older tensorboard_folder
|
||||||
|
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
|
||||||
|
|
||||||
def state_dict(self):
|
def state_dict(self):
|
||||||
return {
|
return {
|
||||||
|
@ -83,6 +92,7 @@ class TrainState:
|
||||||
"num_consumed_tokens": self.num_consumed_tokens,
|
"num_consumed_tokens": self.num_consumed_tokens,
|
||||||
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
||||||
"step_count": self.step_count,
|
"step_count": self.step_count,
|
||||||
|
"tensorboard_folder": self.tensorboard_folder,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import torch
|
||||||
|
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
|
||||||
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
|
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2}
|
||||||
|
|
||||||
|
|
||||||
def get_dataset_type_id(path):
|
def get_dataset_type_id(path):
|
||||||
|
|
|
@ -1,9 +1,15 @@
|
||||||
from .initialize_trainer import initialize_trainer
|
from .initialize_trainer import initialize_trainer
|
||||||
from .launch import get_default_parser, launch_from_slurm, launch_from_torch
|
from .launch import (
|
||||||
|
get_default_parser,
|
||||||
|
initialize_distributed_env,
|
||||||
|
launch_from_slurm,
|
||||||
|
launch_from_torch,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"get_default_parser",
|
"get_default_parser",
|
||||||
"initialize_trainer",
|
"initialize_trainer",
|
||||||
"launch_from_slurm",
|
"launch_from_slurm",
|
||||||
"launch_from_torch",
|
"launch_from_torch",
|
||||||
|
"initialize_distributed_env",
|
||||||
]
|
]
|
||||||
|
|
|
@ -3,16 +3,15 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import torch
|
|
||||||
from torch import Tensor, nn
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
|
||||||
def scaled_init_method_normal(sigma, num_layers):
|
def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
|
||||||
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
|
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
|
||||||
std = sigma / math.sqrt(2.0 * num_layers)
|
std = sigma / math.sqrt(2.0 * num_layers)
|
||||||
|
|
||||||
def init_(tensor):
|
def init_(tensor):
|
||||||
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
|
return nn.init.normal_(tensor, mean=0.0, std=std)
|
||||||
|
|
||||||
return init_
|
return init_
|
||||||
|
|
||||||
|
@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
|
||||||
return nn.init.normal_(tensor, mean, std)
|
return nn.init.normal_(tensor, mean, std)
|
||||||
|
|
||||||
return initializer
|
return initializer
|
||||||
|
|
||||||
|
|
||||||
|
def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
|
||||||
|
"""Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers)."""
|
||||||
|
std = sigma / math.sqrt(2.0 * num_layers)
|
||||||
|
a = math.sqrt(3.0 * std)
|
||||||
|
|
||||||
|
def init_(tensor):
|
||||||
|
return nn.init.uniform_(tensor, -a, a)
|
||||||
|
|
||||||
|
return init_
|
||||||
|
|
||||||
|
|
||||||
|
def uniform_(mean: float = 0.0, std: float = 1.0):
|
||||||
|
r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\mathcal{U}(mean-a, mean+a), where a satisfies \mathcal{U}_{std}=std.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
mean (float): the mean of the uniform distribution. Defaults 0.0.
|
||||||
|
std (float): the standard deviation of the uniform distribution. Defaults 1.0.
|
||||||
|
"""
|
||||||
|
|
||||||
|
a = math.sqrt(3.0 * std)
|
||||||
|
|
||||||
|
def initializer(tensor: Tensor):
|
||||||
|
return nn.init.uniform_(tensor, mean - a, mean + a)
|
||||||
|
|
||||||
|
return initializer
|
||||||
|
|
|
@ -10,6 +10,7 @@ import torch
|
||||||
|
|
||||||
from internlm.core.context import Config
|
from internlm.core.context import Config
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.utils.common import get_master_node
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
from internlm.utils.storage_manager import init_storage_manager
|
from internlm.utils.storage_manager import init_storage_manager
|
||||||
|
|
||||||
|
@ -108,67 +109,100 @@ def args_sanity_check():
|
||||||
logger.info(f"valid_every: {data.valid_every}")
|
logger.info(f"valid_every: {data.valid_every}")
|
||||||
|
|
||||||
# processing the checkpoint config
|
# processing the checkpoint config
|
||||||
if "enable_save_ckpt" not in gpc.config.ckpt:
|
ckpt = gpc.config.ckpt
|
||||||
gpc.config.ckpt._add_item("enable_save_ckpt", False)
|
if "enable_save_ckpt" not in ckpt:
|
||||||
|
ckpt._add_item("enable_save_ckpt", False)
|
||||||
|
|
||||||
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
|
# Saving checkpoint args.
|
||||||
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
|
if ckpt.enable_save_ckpt:
|
||||||
|
assert "checkpoint_every" in ckpt, "If enable save checkpoint, must give checkpoint_every in config.data!"
|
||||||
|
assert ckpt.checkpoint_every > 0
|
||||||
|
assert "save_ckpt_folder" in ckpt, "If enable save checkpoint, must give save_ckpt_folder in config.data!"
|
||||||
|
|
||||||
if "load_optimizer" not in gpc.config.ckpt:
|
if "async_upload" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_optimizer", True)
|
ckpt._add_item("async_upload", False) # async defalut is False.
|
||||||
|
else:
|
||||||
|
if ckpt.async_upload:
|
||||||
|
assert "save_ckpt_folder" in ckpt
|
||||||
|
if "boto3:" not in ckpt.save_ckpt_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.warning(
|
||||||
|
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
|
||||||
|
)
|
||||||
|
ckpt.async_upload = False
|
||||||
|
else:
|
||||||
|
if "async_upload_tmp_folder" not in ckpt:
|
||||||
|
ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
|
||||||
|
|
||||||
if "save_ckpt_folder" not in gpc.config.ckpt:
|
if not ckpt.async_upload:
|
||||||
gpc.config.ckpt._add_item("save_ckpt_folder", None)
|
ckpt._add_item("async_upload_tmp_folder", None)
|
||||||
|
|
||||||
if "load_ckpt_folder" not in gpc.config.ckpt:
|
if "snapshot_ckpt_folder" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_ckpt_folder", None)
|
ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
|
||||||
|
|
||||||
if "load_model_only_folder" not in gpc.config.ckpt:
|
if "oss_snapshot_freq" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_model_only_folder", None)
|
ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
|
||||||
|
else:
|
||||||
|
ckpt._add_item("checkpoint_every", float("inf"))
|
||||||
|
ckpt._add_item("oss_snapshot_freq", float("inf"))
|
||||||
|
ckpt._add_item("save_ckpt_folder", None)
|
||||||
|
ckpt._add_item("async_upload", False)
|
||||||
|
ckpt._add_item("async_upload_tmp_folder", None)
|
||||||
|
ckpt._add_item("snapshot_ckpt_folder", None)
|
||||||
|
|
||||||
if "async_upload" not in gpc.config.ckpt:
|
# Loading checkpoint args.
|
||||||
gpc.config.ckpt._add_item("async_upload", False)
|
if "load_model_only_folder" not in ckpt:
|
||||||
|
ckpt._add_item("load_model_only_folder", None)
|
||||||
|
|
||||||
if "async_upload_tmp_folder" not in gpc.config.ckpt:
|
if "load_ckpt_folder" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
|
ckpt._add_item("load_ckpt_folder", None)
|
||||||
|
|
||||||
if gpc.config.ckpt.async_upload:
|
if "load_optimizer" not in ckpt:
|
||||||
assert "save_ckpt_folder" in gpc.config.ckpt
|
ckpt._add_item("load_optimizer", True)
|
||||||
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
|
|
||||||
gpc.config.ckpt.async_upload = False
|
|
||||||
|
|
||||||
if "snapshot_ckpt_folder" not in gpc.config.ckpt:
|
if "stop_file_path" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
|
ckpt._add_item("stop_file_path", None)
|
||||||
|
|
||||||
if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"):
|
if "load_given_ckpt" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2)
|
# If 'load_given_ckpt' is not given, we set it to False, so internlm can have opportunity
|
||||||
assert gpc.config.ckpt.oss_snapshot_freq > 0
|
# to auto-load latest checkpoint.
|
||||||
|
ckpt._add_item("load_given_ckpt", False)
|
||||||
|
|
||||||
assert not (
|
if ckpt.load_given_ckpt:
|
||||||
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
|
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
|
||||||
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
|
if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
|
||||||
|
logger.warning(
|
||||||
|
"Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
|
||||||
|
and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
|
||||||
|
)
|
||||||
|
ckpt.load_model_only_folder = None
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
|
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
|
||||||
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
|
logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
|
||||||
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
|
logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
|
||||||
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
|
logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
|
||||||
logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
|
logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
|
||||||
if gpc.config.ckpt.async_upload:
|
|
||||||
logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
|
|
||||||
|
|
||||||
# initialization storage manager
|
# initialization storage manager
|
||||||
init_storage_manager(gpc.config.ckpt)
|
init_storage_manager(ckpt)
|
||||||
|
|
||||||
# tensorboard writer config
|
# tensorboard writer config
|
||||||
if "enable_tb" not in gpc.config:
|
if "enable_tb" not in gpc.config:
|
||||||
gpc.config._add_item("enable_tb", True)
|
gpc.config._add_item("enable_tb", True)
|
||||||
if "tensorboard_folder" not in gpc.config:
|
if "tensorboard_folder" not in gpc.config:
|
||||||
gpc.config._add_item("tensorboard_folder", None)
|
gpc.config._add_item(
|
||||||
|
"tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
|
||||||
|
)
|
||||||
if "resume_tb_folder" not in gpc.config:
|
if "resume_tb_folder" not in gpc.config:
|
||||||
gpc.config._add_item("resume_tb_folder", None)
|
gpc.config._add_item(
|
||||||
|
"resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
|
||||||
|
)
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
|
||||||
|
logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
|
||||||
|
|
||||||
# cudnn
|
# cudnn
|
||||||
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
|
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
|
||||||
|
@ -191,10 +225,8 @@ def args_sanity_check():
|
||||||
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
|
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
|
||||||
gpc.config.model.dtype = torch.float16
|
gpc.config.model.dtype = torch.float16
|
||||||
elif gpc.config.model.dtype == "torch.float32":
|
elif gpc.config.model.dtype == "torch.float32":
|
||||||
assert gpc.config.model.use_flash_attn is False, "when using float32, the use_flash_attn must be False"
|
|
||||||
gpc.config.model.dtype = torch.float32
|
gpc.config.model.dtype = torch.float32
|
||||||
elif gpc.config.model.dtype == "torch.tf32":
|
elif gpc.config.model.dtype == "torch.tf32":
|
||||||
assert gpc.config.model.use_flash_attn is False, "when using tf32, the use_flash_attn must be False"
|
|
||||||
torch.backends.cudnn.allow_tf32 = True
|
torch.backends.cudnn.allow_tf32 = True
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
gpc.config.model.dtype = torch.float32
|
gpc.config.model.dtype = torch.float32
|
||||||
|
@ -236,17 +268,32 @@ def args_sanity_check():
|
||||||
# process the model config
|
# process the model config
|
||||||
if "use_flash_attn" not in gpc.config.model:
|
if "use_flash_attn" not in gpc.config.model:
|
||||||
gpc.config.model._add_item("use_flash_attn", True)
|
gpc.config.model._add_item("use_flash_attn", True)
|
||||||
if "sequence_parallel" not in gpc.config.model:
|
|
||||||
gpc.config.model._add_item("sequence_parallel", False)
|
# process the parallel config
|
||||||
|
if "sequence_parallel" not in gpc.config.parallel:
|
||||||
|
gpc.config.parallel._add_item("sequence_parallel", False)
|
||||||
else:
|
else:
|
||||||
assert not (
|
assert not (
|
||||||
gpc.config.model.sequence_parallel is True and gpc.config.model.use_flash_attn is False
|
gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
|
||||||
), "sequence parallel does not support use_flash_attn=False"
|
), "sequence parallel does not support use_flash_attn=False"
|
||||||
|
|
||||||
# feishu webhook address for alerting
|
# feishu webhook address for alerting
|
||||||
if "alert_address" not in gpc.config:
|
if "alert_address" not in gpc.config:
|
||||||
gpc.config._add_item("alert_address", None)
|
gpc.config._add_item("alert_address", None)
|
||||||
|
|
||||||
|
optim_ckpt = gpc.config.hybrid_zero_optimizer
|
||||||
|
if "zero_overlap_communication" in optim_ckpt:
|
||||||
|
# Compatible with the old interfaces.
|
||||||
|
optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
|
||||||
|
if "overlap_sync_grad" not in optim_ckpt:
|
||||||
|
optim_ckpt._add_item("overlap_sync_grad", False)
|
||||||
|
if "overlap_sync_param" not in optim_ckpt:
|
||||||
|
optim_ckpt._add_item("overlap_sync_param", False)
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def launch(
|
def launch(
|
||||||
config: Union[str, Path, Config, Dict],
|
config: Union[str, Path, Config, Dict],
|
||||||
|
@ -293,8 +340,6 @@ def launch(
|
||||||
# init process groups for different parallel modes from config
|
# init process groups for different parallel modes from config
|
||||||
gpc.init_parallel_groups()
|
gpc.init_parallel_groups()
|
||||||
|
|
||||||
args_sanity_check()
|
|
||||||
|
|
||||||
# set cuda device
|
# set cuda device
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
# if local rank is not given, calculate automatically
|
# if local rank is not given, calculate automatically
|
||||||
|
@ -347,7 +392,11 @@ def launch_from_slurm(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
|
def launch_from_torch(
|
||||||
|
config: Union[str, Path, Config, Dict],
|
||||||
|
backend: str = "nccl",
|
||||||
|
seed: int = 1024,
|
||||||
|
):
|
||||||
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
|
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
|
||||||
from the environment variables set by PyTorch
|
from the environment variables set by PyTorch
|
||||||
|
|
||||||
|
@ -375,3 +424,38 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
|
||||||
backend=backend,
|
backend=backend,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_distributed_env(
|
||||||
|
config: str,
|
||||||
|
launcher: str = "slurm",
|
||||||
|
master_port: int = 8888,
|
||||||
|
seed: int = 1024,
|
||||||
|
args_check=True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize distributed environment for distributed training.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (str): Config file path.
|
||||||
|
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
|
||||||
|
master_port (str): The master port for distributed training. 8888 by default.
|
||||||
|
seed (int, optional): Specified random seed for every process. 1024 by default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
if launcher == "torch":
|
||||||
|
launch_from_torch(config=config, seed=seed)
|
||||||
|
elif launcher == "slurm":
|
||||||
|
launch_from_slurm(
|
||||||
|
config=config,
|
||||||
|
host=get_master_node(),
|
||||||
|
port=master_port,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
|
||||||
|
|
||||||
|
if args_check:
|
||||||
|
args_sanity_check()
|
||||||
|
|
|
@ -7,6 +7,7 @@ import rotary_emb
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
from flash_attn.layers.rotary import ApplyRotaryEmb as LegacyApplyRotaryEmb
|
||||||
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
|
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
|
||||||
from torch import Tensor, nn
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
@ -56,7 +57,7 @@ class Embedding1D(nn.Module):
|
||||||
|
|
||||||
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
|
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
|
||||||
|
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
|
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
@ -111,6 +112,7 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
|
||||||
|
|
||||||
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
|
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
|
||||||
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
|
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
|
||||||
|
legacy_apply_rotary_embed = LegacyApplyRotaryEmb.apply
|
||||||
|
|
||||||
|
|
||||||
class RotaryEmbedding(torch.nn.Module):
|
class RotaryEmbedding(torch.nn.Module):
|
||||||
|
@ -135,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
|
||||||
""" """
|
""" """
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Generate and save the inverse frequency buffer (non trainable)
|
# Generate and save the inverse frequency buffer (non trainable)
|
||||||
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
|
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
|
||||||
self.register_buffer("inv_freq", inv_freq)
|
|
||||||
self.scale_base = scale_base
|
self.scale_base = scale_base
|
||||||
scale = (
|
self.scale = (
|
||||||
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
|
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
|
||||||
if scale_base > 0
|
if scale_base > 0
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
self.register_buffer("scale", scale)
|
|
||||||
|
|
||||||
self._seq_len_cached = 0
|
self._seq_len_cached = 0
|
||||||
self._cos_cached = None
|
self._cos_cached = None
|
||||||
|
@ -218,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
|
||||||
self._cos_k_cached[seqlen_offset:],
|
self._cos_k_cached[seqlen_offset:],
|
||||||
self._sin_k_cached[seqlen_offset:],
|
self._sin_k_cached[seqlen_offset:],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _single_forward(self, x, indexes=0):
|
||||||
|
assert self.scale is None
|
||||||
|
self._update_cos_sin_cache(x, indexes)
|
||||||
|
x = x[None, ...]
|
||||||
|
ret = legacy_apply_rotary_embed(x, self._cos_cached[indexes], self._sin_cached[indexes]).squeeze(0)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def _single_eval_forward(self, x, seqlen_offset=0):
|
||||||
|
assert self.scale is None
|
||||||
|
self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])
|
||||||
|
return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])
|
||||||
|
|
|
@ -62,7 +62,7 @@ class ScaleColumnParallelLinear(nn.Linear):
|
||||||
weight,
|
weight,
|
||||||
self.bias,
|
self.bias,
|
||||||
process_group=self.process_group,
|
process_group=self.process_group,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
|
||||||
weight,
|
weight,
|
||||||
self.bias,
|
self.bias,
|
||||||
process_group=self.process_group,
|
process_group=self.process_group,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -173,7 +173,7 @@ class FeedForward(nn.Module):
|
||||||
hidden_features,
|
hidden_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias,
|
bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
@ -182,7 +182,7 @@ class FeedForward(nn.Module):
|
||||||
hidden_features,
|
hidden_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias,
|
bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
@ -191,7 +191,7 @@ class FeedForward(nn.Module):
|
||||||
out_features,
|
out_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias=bias,
|
bias=bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
|
@ -176,7 +176,7 @@ class AccPerplex:
|
||||||
res.update(ds_acc)
|
res.update(ds_acc)
|
||||||
res.update(ds_tokens)
|
res.update(ds_tokens)
|
||||||
|
|
||||||
loss_res = self.loss_with_type_id.get_metric()
|
loss_res = self.loss_with_type_id.get_metric(reset)
|
||||||
res.update(loss_res)
|
res.update(loss_res)
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
|
@ -121,7 +121,7 @@ class PackedFlashBaseLayer1D(nn.Module):
|
||||||
process_group=gpc.get_group(ParallelMode.TENSOR),
|
process_group=gpc.get_group(ParallelMode.TENSOR),
|
||||||
bias1=False,
|
bias1=False,
|
||||||
bias2=False,
|
bias2=False,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
checkpoint_lvl=0,
|
checkpoint_lvl=0,
|
||||||
heuristic="auto",
|
heuristic="auto",
|
||||||
device=device,
|
device=device,
|
||||||
|
@ -294,7 +294,7 @@ class PackedFlashInternLm1D(nn.Module):
|
||||||
max_position_embeddings=-1,
|
max_position_embeddings=-1,
|
||||||
process_group=gpc.get_group(ParallelMode.TENSOR),
|
process_group=gpc.get_group(ParallelMode.TENSOR),
|
||||||
padding_idx=None,
|
padding_idx=None,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
|
@ -82,7 +82,7 @@ class MHA(nn.Module):
|
||||||
3 * embed_dim,
|
3 * embed_dim,
|
||||||
process_group,
|
process_group,
|
||||||
bias=True,
|
bias=True,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
**factory_kwargs,
|
**factory_kwargs,
|
||||||
) # according to https://spaces.ac.cn/archives/9577
|
) # according to https://spaces.ac.cn/archives/9577
|
||||||
|
|
||||||
|
@ -95,7 +95,11 @@ class MHA(nn.Module):
|
||||||
|
|
||||||
# output projection always have the bias (for now)
|
# output projection always have the bias (for now)
|
||||||
self.out_proj = RowParallelLinearTorch(
|
self.out_proj = RowParallelLinearTorch(
|
||||||
embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.model.sequence_parallel, **factory_kwargs
|
embed_dim,
|
||||||
|
embed_dim,
|
||||||
|
process_group,
|
||||||
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
|
**factory_kwargs,
|
||||||
)
|
)
|
||||||
# need to assign tp attribute so that internlm know it is tensor parallel module
|
# need to assign tp attribute so that internlm know it is tensor parallel module
|
||||||
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
|
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
|
||||||
|
@ -128,7 +132,13 @@ class MHA(nn.Module):
|
||||||
qkv = self.rotary_emb(qkv, **kwargs)
|
qkv = self.rotary_emb(qkv, **kwargs)
|
||||||
|
|
||||||
if inference_params is None:
|
if inference_params is None:
|
||||||
context = self.inner_attn(qkv)
|
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
|
||||||
|
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
|
||||||
|
if qkv.dtype not in [torch.float16, torch.bfloat16]:
|
||||||
|
qkv = qkv.to(torch.bfloat16)
|
||||||
|
context = self.inner_attn(qkv).to(x.dtype)
|
||||||
|
else:
|
||||||
|
context = self.inner_attn(qkv)
|
||||||
else:
|
else:
|
||||||
q = qkv[:, :, 0]
|
q = qkv[:, :, 0]
|
||||||
assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
|
assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
|
||||||
|
@ -160,7 +170,14 @@ class MHA(nn.Module):
|
||||||
kwargs.pop("indexes")
|
kwargs.pop("indexes")
|
||||||
|
|
||||||
if inference_params is None:
|
if inference_params is None:
|
||||||
context = self.inner_attn(qkv, **kwargs)
|
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
|
||||||
|
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
|
||||||
|
if qkv.dtype not in [torch.float16, torch.bfloat16]:
|
||||||
|
qkv = qkv.to(torch.bfloat16)
|
||||||
|
context = self.inner_attn(qkv, **kwargs).to(x.dtype)
|
||||||
|
else:
|
||||||
|
context = self.inner_attn(qkv, **kwargs)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("Not support this right now")
|
raise RuntimeError("Not support this right now")
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
|
@ -19,6 +20,7 @@ from internlm.solver.optimizer.store import (
|
||||||
)
|
)
|
||||||
from internlm.solver.optimizer.utils import (
|
from internlm.solver.optimizer.utils import (
|
||||||
DynamicGradScaler,
|
DynamicGradScaler,
|
||||||
|
ParamBcastSyncHandler,
|
||||||
flatten,
|
flatten,
|
||||||
get_grad_accumulate_object,
|
get_grad_accumulate_object,
|
||||||
has_inf_or_nan,
|
has_inf_or_nan,
|
||||||
|
@ -87,9 +89,9 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self,
|
self,
|
||||||
optimizer: Optimizer,
|
optimizer: Optimizer,
|
||||||
cpu_offload=False,
|
cpu_offload=False,
|
||||||
overlap_broadcast=False,
|
|
||||||
grad_scal_cfg: Config = None,
|
grad_scal_cfg: Config = None,
|
||||||
zero_cfg: Config = None,
|
zero_cfg: Config = None,
|
||||||
|
param_bcast_sync_handler: ParamBcastSyncHandler = None,
|
||||||
):
|
):
|
||||||
# DynamicGradScaler related args
|
# DynamicGradScaler related args
|
||||||
if gpc.config.model.dtype is torch.float32:
|
if gpc.config.model.dtype is torch.float32:
|
||||||
|
@ -104,9 +106,10 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
max_scale = grad_scal_cfg.max_scale
|
max_scale = grad_scal_cfg.max_scale
|
||||||
|
|
||||||
# Zero related args
|
# Zero related args
|
||||||
overlap_communication = zero_cfg.zero_overlap_communication
|
|
||||||
reduce_bucket_size = zero_cfg.reduce_bucket_size
|
reduce_bucket_size = zero_cfg.reduce_bucket_size
|
||||||
clip_grad_norm = zero_cfg.clip_grad_norm
|
clip_grad_norm = zero_cfg.clip_grad_norm
|
||||||
|
self._overlap_sync_grad = zero_cfg.overlap_sync_grad
|
||||||
|
self._overlap_sync_param = zero_cfg.overlap_sync_param
|
||||||
|
|
||||||
super().__init__(optim=optimizer)
|
super().__init__(optim=optimizer)
|
||||||
|
|
||||||
|
@ -127,7 +130,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._fp32_flat_param_groups_of_current_rank = dict()
|
self._fp32_flat_param_groups_of_current_rank = dict()
|
||||||
|
|
||||||
# communication params
|
# communication params
|
||||||
self._overlap_communication = overlap_communication
|
# self._overlap_communication = overlap_communication
|
||||||
self._reduce_bucket_size = reduce_bucket_size
|
self._reduce_bucket_size = reduce_bucket_size
|
||||||
|
|
||||||
# gradient scaler
|
# gradient scaler
|
||||||
|
@ -158,7 +161,12 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
+ f"zo-{self._zero_local_rank}.pt"
|
+ f"zo-{self._zero_local_rank}.pt"
|
||||||
)
|
)
|
||||||
self.params_per_rank_id_dict = []
|
self.params_per_rank_id_dict = []
|
||||||
self.overlap_broadcast = overlap_broadcast
|
self._param_bcast_sync_handler = param_bcast_sync_handler
|
||||||
|
if self._overlap_sync_param:
|
||||||
|
assert self._param_bcast_sync_handler is not None
|
||||||
|
self._broadcast_comm_stream = torch.cuda.Stream()
|
||||||
|
else:
|
||||||
|
self._broadcast_comm_stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
# iterate over the param group in the optimizer
|
# iterate over the param group in the optimizer
|
||||||
# partition these param groups for data parallel training
|
# partition these param groups for data parallel training
|
||||||
|
@ -228,12 +236,14 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# initialize communication stream for
|
# initialize communication stream for
|
||||||
# communication-computation overlapping
|
# communication-computation overlapping
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
self._comm_stream = torch.cuda.Stream()
|
self._comm_stream = torch.cuda.Stream()
|
||||||
|
else:
|
||||||
|
self._comm_stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
# reduction hook is only used if overlapping communication
|
# reduction hook is only used if overlapping communication
|
||||||
# if it is stage 1 without overlapping, no hook will be attached
|
# if it is stage 1 without overlapping, no hook will be attached
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
self._attach_reduction_hook()
|
self._attach_reduction_hook()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -267,8 +277,10 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
global_id = str(i)
|
global_id = str(i)
|
||||||
for j in range(len(param.size())):
|
for j in range(len(param.size())):
|
||||||
global_id = "_".join([global_id, str(param.size()[j])])
|
global_id = "_".join([global_id, str(param.size()[j])])
|
||||||
|
if self._overlap_sync_param:
|
||||||
rank_to_go = numel_per_rank.index(min(numel_per_rank))
|
rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
|
||||||
|
else:
|
||||||
|
rank_to_go = numel_per_rank.index(min(numel_per_rank))
|
||||||
params_per_rank[rank_to_go].append(param)
|
params_per_rank[rank_to_go].append(param)
|
||||||
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
|
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
|
||||||
numel_per_rank[rank_to_go] += param.numel()
|
numel_per_rank[rank_to_go] += param.numel()
|
||||||
|
@ -299,7 +311,9 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
|
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
|
||||||
|
|
||||||
reduction_func = partial(
|
reduction_func = partial(
|
||||||
self._store_and_try_reduce_grads_by_bucket, param=param, reduce_rank=reduce_rank
|
self._store_and_try_reduce_grads_by_bucket,
|
||||||
|
param=param,
|
||||||
|
reduce_rank=reduce_rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
# define hook
|
# define hook
|
||||||
|
@ -384,17 +398,17 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
|
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
|
||||||
|
|
||||||
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
|
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
stream = self._comm_stream
|
self._comm_stream.synchronize()
|
||||||
stream.synchronize()
|
|
||||||
self._param_store.clear_grads_of_previous_reduced_params()
|
self._param_store.clear_grads_of_previous_reduced_params()
|
||||||
else:
|
|
||||||
stream = torch.cuda.current_stream()
|
|
||||||
|
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(self._comm_stream):
|
||||||
flat = bucket.flatten()
|
flat = bucket.flatten()
|
||||||
reduced_flat = reduce_tensor(
|
reduced_flat = reduce_tensor(
|
||||||
tensor=flat, dtype=self.dtype, dst_rank=reduce_rank, parallel_mode=ParallelMode.DATA
|
tensor=flat,
|
||||||
|
dtype=self.dtype,
|
||||||
|
dst_rank=reduce_rank,
|
||||||
|
parallel_mode=ParallelMode.DATA,
|
||||||
)
|
)
|
||||||
|
|
||||||
# update the reduced tensor
|
# update the reduced tensor
|
||||||
|
@ -483,6 +497,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
grads = [self.padding_grad]
|
grads = [self.padding_grad]
|
||||||
params = [self.padding_tensor]
|
params = [self.padding_tensor]
|
||||||
|
|
||||||
|
norm = 0
|
||||||
if self._clip_grad_norm > 0:
|
if self._clip_grad_norm > 0:
|
||||||
# this norm is before scaling, it will be very large
|
# this norm is before scaling, it will be very large
|
||||||
norm = compute_norm(
|
norm = compute_norm(
|
||||||
|
@ -507,7 +522,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# if not overlapping communication (no reduction hook is attached)
|
# if not overlapping communication (no reduction hook is attached)
|
||||||
# we need to manually reduce these gradients
|
# we need to manually reduce these gradients
|
||||||
if not self._overlap_communication:
|
if not self._overlap_sync_grad:
|
||||||
for group_id in range(len(self._fp16_param_groups)):
|
for group_id in range(len(self._fp16_param_groups)):
|
||||||
for param in self._fp16_param_groups[group_id]:
|
for param in self._fp16_param_groups[group_id]:
|
||||||
if param.grad is not None:
|
if param.grad is not None:
|
||||||
|
@ -522,18 +537,21 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
|
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
|
||||||
|
|
||||||
# clear reduced grads
|
# clear reduced grads
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
# grads in the last bucket is reduced
|
# grads in the last bucket is reduced
|
||||||
self._comm_stream.synchronize()
|
self._comm_stream.synchronize()
|
||||||
self._param_store.clear_grads_of_previous_reduced_params()
|
self._param_store.clear_grads_of_previous_reduced_params()
|
||||||
|
|
||||||
# compute norm for gradients in the last bucket
|
# compute norm for gradients in the last bucket
|
||||||
total_norms = []
|
total_norms = {}
|
||||||
for group_id in range(self.num_param_groups):
|
for group_id in range(self.num_param_groups):
|
||||||
total_norms.append(
|
group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
|
||||||
self._compute_norm_with_stage(
|
group_name = f"{group_id}_{group_name}"
|
||||||
group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
|
total_norms[group_name] = self._compute_norm_with_stage(
|
||||||
)
|
group_id=group_id,
|
||||||
|
last_bucket=True,
|
||||||
|
last_stage=True,
|
||||||
|
previous_norm=groups_norms[group_id],
|
||||||
)
|
)
|
||||||
|
|
||||||
timer("sync_grad").start()
|
timer("sync_grad").start()
|
||||||
|
@ -552,7 +570,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
# found_inf = self._check_overflow()
|
# found_inf = self._check_overflow()
|
||||||
# Because you may encounter inf when computing norm
|
# Because you may encounter inf when computing norm
|
||||||
|
|
||||||
if -1 in norms:
|
if -1 in norms.values():
|
||||||
found_inf = True
|
found_inf = True
|
||||||
|
|
||||||
loss_scale = float(self.loss_scale.item()) # backup
|
loss_scale = float(self.loss_scale.item()) # backup
|
||||||
|
@ -562,10 +580,13 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
if found_inf:
|
if found_inf:
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.warning("Overflow occurs, please check it.")
|
logger.warning("Overflow occurs, please check it.")
|
||||||
send_alert_message(address=gpc.config.alert_address, message="Overflow occurs, please check it.")
|
send_alert_message(
|
||||||
|
address=gpc.config.alert_address,
|
||||||
|
message="Overflow occurs, please check it.",
|
||||||
|
)
|
||||||
self._grad_store._averaged_gradients = dict()
|
self._grad_store._averaged_gradients = dict()
|
||||||
self.zero_grad()
|
self.zero_grad()
|
||||||
return False, None
|
return False, norms
|
||||||
|
|
||||||
# copy the grad of fp16 param to fp32 param
|
# copy the grad of fp16 param to fp32 param
|
||||||
single_grad_partition_groups = []
|
single_grad_partition_groups = []
|
||||||
|
@ -597,15 +618,17 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# unscale and clip grads
|
# unscale and clip grads
|
||||||
# get the global norm
|
# get the global norm
|
||||||
global_norm_groups = []
|
global_norm_groups = {}
|
||||||
if self._clip_grad_norm > 0:
|
if self._clip_grad_norm > 0:
|
||||||
for norm in norms:
|
for group_name, norm in norms.items():
|
||||||
global_norm_groups.append(norm**0.5)
|
global_norm_groups[group_name] = norm**0.5
|
||||||
|
|
||||||
# the following operations are performed only on the rank to which parameters are assigned.
|
# the following operations are performed only on the rank to which parameters are assigned.
|
||||||
if gpc.config.model.dtype is not torch.float32:
|
if gpc.config.model.dtype is not torch.float32:
|
||||||
if len(single_grad_partition_groups) != 0:
|
if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
|
||||||
self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
|
self._unscale_and_clip_grads(
|
||||||
|
single_grad_partition_groups, list(global_norm_groups.values()), loss_scale
|
||||||
|
)
|
||||||
|
|
||||||
# update the parameters
|
# update the parameters
|
||||||
timer("step").start()
|
timer("step").start()
|
||||||
|
@ -625,35 +648,42 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
|
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
|
||||||
fp16_param.data.copy_(fp32_param)
|
fp16_param.data.copy_(fp32_param)
|
||||||
|
|
||||||
# TODO: support broadcast overlap
|
with torch.cuda.stream(self._broadcast_comm_stream):
|
||||||
self.broadcast_params(overlap=False)
|
self.broadcast_params()
|
||||||
|
|
||||||
timer("step").stop()
|
timer("step").stop()
|
||||||
|
|
||||||
# update gradients may not be needed here, because the sync_params function is used in initialization,
|
# update gradients may not be needed here, because the sync_params function is used in initialization,
|
||||||
# so synchronization is maintained
|
# so synchronization is maintained
|
||||||
return True, [global_norm / loss_scale for global_norm in global_norm_groups]
|
for group_name, global_norm in global_norm_groups.items():
|
||||||
|
global_norm_groups[group_name] = global_norm / loss_scale
|
||||||
|
return True, global_norm_groups
|
||||||
|
|
||||||
def broadcast_params(self, overlap=False):
|
def broadcast_params(self):
|
||||||
handles = []
|
handles = []
|
||||||
|
|
||||||
for group_id in range(self.num_param_groups):
|
for rank, group_id in product(range(self._zero_world_size), range(self.num_param_groups)):
|
||||||
for rank in range(self._zero_world_size):
|
# The following operations are performed only on the rank to which parameters are assigned.
|
||||||
# The following operations are performed only on the rank to which parameters are assigned.
|
if rank in self.param_group_no_params_ranks[group_id]:
|
||||||
if rank not in self.param_group_no_params_ranks[group_id]:
|
continue
|
||||||
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
|
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
|
||||||
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
|
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
|
||||||
# assert grank == rank, f"{grank} == {rank}"
|
# assert grank == rank, f"{grank} == {rank}"
|
||||||
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
|
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
|
||||||
handle = dist.broadcast(
|
handle = dist.broadcast(
|
||||||
fp16_param, src=g_rank, group=gpc.get_group(ParallelMode.ZERO1), async_op=True
|
fp16_param,
|
||||||
)
|
src=g_rank,
|
||||||
handles.append(handle)
|
group=gpc.get_group(ParallelMode.ZERO1),
|
||||||
|
async_op=True,
|
||||||
|
)
|
||||||
|
|
||||||
if not overlap:
|
if self._overlap_sync_param:
|
||||||
for handle in handles:
|
self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
|
||||||
handle.wait()
|
else:
|
||||||
else:
|
handles.append(handle)
|
||||||
return handles
|
|
||||||
|
for handle in handles:
|
||||||
|
handle.wait()
|
||||||
|
|
||||||
##################
|
##################
|
||||||
# FP16 Utilities #
|
# FP16 Utilities #
|
||||||
|
@ -671,7 +701,11 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
if avg_grad is not None and has_inf_or_nan(avg_grad):
|
if avg_grad is not None and has_inf_or_nan(avg_grad):
|
||||||
self._found_overflow.fill_(1.0)
|
self._found_overflow.fill_(1.0)
|
||||||
break
|
break
|
||||||
dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL))
|
dist.all_reduce(
|
||||||
|
self._found_overflow,
|
||||||
|
op=dist.ReduceOp.MAX,
|
||||||
|
group=gpc.get_group(ParallelMode.GLOBAL),
|
||||||
|
)
|
||||||
|
|
||||||
return self._found_overflow.item() > 0
|
return self._found_overflow.item() > 0
|
||||||
|
|
||||||
|
|
|
@ -3,15 +3,18 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict, Optional
|
from collections import OrderedDict
|
||||||
|
from functools import partial
|
||||||
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from torch import Tensor
|
from torch import Tensor, nn
|
||||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
||||||
|
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.context import ParallelMode
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
|
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
from internlm.utils.parallel import is_model_parallel_parameter
|
from internlm.utils.parallel import is_model_parallel_parameter
|
||||||
|
@ -60,12 +63,19 @@ def get_grad_accumulate_object(tensor):
|
||||||
|
|
||||||
|
|
||||||
def split_half_float_double(tensor_list):
|
def split_half_float_double(tensor_list):
|
||||||
dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor"]
|
dtype_buckets = {
|
||||||
buckets = []
|
"torch.cuda.HalfTensor": [],
|
||||||
for _, dtype in enumerate(dtypes):
|
"torch.cuda.FloatTensor": [],
|
||||||
bucket = [t for t in tensor_list if t.type() == dtype]
|
"torch.cuda.DoubleTensor": [],
|
||||||
if bucket:
|
"torch.cuda.BFloat16Tensor": [],
|
||||||
buckets.append(bucket)
|
}
|
||||||
|
|
||||||
|
for t in tensor_list:
|
||||||
|
dtype = t.type()
|
||||||
|
if dtype in dtype_buckets:
|
||||||
|
dtype_buckets[dtype].append(t)
|
||||||
|
|
||||||
|
buckets = [bucket for bucket in dtype_buckets.values() if bucket]
|
||||||
return buckets
|
return buckets
|
||||||
|
|
||||||
|
|
||||||
|
@ -184,7 +194,10 @@ def calc_l2_norm(grads):
|
||||||
if APEX_AVAILABLE:
|
if APEX_AVAILABLE:
|
||||||
dummy_overflow_buf = torch.cuda.IntTensor([0])
|
dummy_overflow_buf = torch.cuda.IntTensor([0])
|
||||||
norm, _ = multi_tensor_applier(
|
norm, _ = multi_tensor_applier(
|
||||||
amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads], False # no per-parameter norm
|
amp_C.multi_tensor_l2norm,
|
||||||
|
dummy_overflow_buf,
|
||||||
|
[grads],
|
||||||
|
False, # no per-parameter norm
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
norm, _ = multi_tensor_l2norm_torch(grads, False)
|
norm, _ = multi_tensor_l2norm_torch(grads, False)
|
||||||
|
@ -228,7 +241,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
|
||||||
|
|
||||||
# Take max across all model-parallel GPUs.
|
# Take max across all model-parallel GPUs.
|
||||||
if gpc.get_world_size(ParallelMode.MODEL) > 1:
|
if gpc.get_world_size(ParallelMode.MODEL) > 1:
|
||||||
dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL))
|
dist.all_reduce(
|
||||||
|
total_norm_cuda,
|
||||||
|
op=dist.ReduceOp.MAX,
|
||||||
|
group=gpc.get_group(ParallelMode.MODEL),
|
||||||
|
)
|
||||||
total_norm = total_norm_cuda[0].item()
|
total_norm = total_norm_cuda[0].item()
|
||||||
else:
|
else:
|
||||||
tensor_parallel_grads = []
|
tensor_parallel_grads = []
|
||||||
|
@ -280,7 +297,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
|
||||||
|
|
||||||
# Sum across all model-parallel GPUs.
|
# Sum across all model-parallel GPUs.
|
||||||
if gpc.is_initialized(ParallelMode.MODEL):
|
if gpc.is_initialized(ParallelMode.MODEL):
|
||||||
dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
|
dist.all_reduce(
|
||||||
|
total_norm,
|
||||||
|
op=dist.ReduceOp.SUM,
|
||||||
|
group=gpc.get_group(ParallelMode.MODEL),
|
||||||
|
)
|
||||||
|
|
||||||
# This is because we use zero1, so we need to use this reduction.
|
# This is because we use zero1, so we need to use this reduction.
|
||||||
# TODO: Check zero group to be a subset of dp group.
|
# TODO: Check zero group to be a subset of dp group.
|
||||||
|
@ -459,3 +480,90 @@ class DynamicGradScaler(BaseGradScaler):
|
||||||
self._scale = self._scale.fill_(state_dict["_scale"])
|
self._scale = self._scale.fill_(state_dict["_scale"])
|
||||||
self._growth_step = state_dict["_growth_step"]
|
self._growth_step = state_dict["_growth_step"]
|
||||||
self._hysteresis_step = state_dict["_hysteresis_step"]
|
self._hysteresis_step = state_dict["_hysteresis_step"]
|
||||||
|
|
||||||
|
|
||||||
|
class ParamBcastSyncHandler:
|
||||||
|
"""
|
||||||
|
Model Partition Handler for overlap broadcast with forward
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, model: Union[nn.Module, nn.ModuleList]) -> None:
|
||||||
|
self._block_to_param = OrderedDict() # <key: nn.Module> <value: list(param)>
|
||||||
|
self._param_to_rank = dict() # <key: param> <value: rank)>
|
||||||
|
self._block_to_rank = dict() # <key: nn.Module> <value: rank)>
|
||||||
|
self._bcast_handles = dict() # <key: rank> <value: list(bcast handles))>
|
||||||
|
|
||||||
|
zero1_size = gpc.get_world_size(ParallelMode.ZERO1)
|
||||||
|
total_param_num = sum(p.numel() for p in model.parameters())
|
||||||
|
avg_param_num = total_param_num * 1.0 // zero1_size
|
||||||
|
|
||||||
|
# just want to share same for loop for ModuleList and Module
|
||||||
|
if not isinstance(model, nn.ModuleList):
|
||||||
|
model = [model]
|
||||||
|
|
||||||
|
# record the parameters to transformer/embeding/head/norm block
|
||||||
|
for _chunk in model:
|
||||||
|
if isinstance(_chunk, NaiveAMPModel):
|
||||||
|
_chunk = _chunk.model
|
||||||
|
|
||||||
|
for _, children in _chunk.named_children():
|
||||||
|
# should be the transformer block definaton in modeling_xxx.py
|
||||||
|
if isinstance(children, nn.ModuleList):
|
||||||
|
# record the block that a parameter belongs to
|
||||||
|
for _, block in enumerate(children):
|
||||||
|
# self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
|
||||||
|
self._block_to_param[block] = list(block.parameters())
|
||||||
|
else:
|
||||||
|
# record the block that a parameter belongs to
|
||||||
|
# self._block_to_param[name] = list(children.parameters())
|
||||||
|
self._block_to_param[children] = list(children.parameters())
|
||||||
|
|
||||||
|
alloc_num = 0
|
||||||
|
rank_to_go = 0
|
||||||
|
|
||||||
|
# process the parameters in block_to_param sequencially,
|
||||||
|
# allocate each parameter to a local rank of ParallelMode.ZERO1,
|
||||||
|
# NOTE that we do NOT consider following scenarios:
|
||||||
|
# 1) whether a parameter is trainable;
|
||||||
|
# 2) paramters maybe in different optimizer group
|
||||||
|
for block, params in self._block_to_param.items():
|
||||||
|
# allocate a model block to a local rank of ParallelMode.ZERO1
|
||||||
|
self._block_to_rank[block] = [rank_to_go]
|
||||||
|
for p in params:
|
||||||
|
alloc_num = alloc_num + p.numel()
|
||||||
|
# in this case, allocate the param to next rank if possible
|
||||||
|
if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
|
||||||
|
rank_to_go = rank_to_go + 1
|
||||||
|
alloc_num = 0
|
||||||
|
self._block_to_rank[block].append(rank_to_go)
|
||||||
|
# allocate a parameter to a local rank of ParallelMode.ZERO1
|
||||||
|
self._param_to_rank[p] = rank_to_go
|
||||||
|
|
||||||
|
# initialize an empty list for _bcast_handles of each rank
|
||||||
|
for rank in range(gpc.get_world_size(ParallelMode.ZERO1)):
|
||||||
|
self._bcast_handles[rank] = []
|
||||||
|
|
||||||
|
# register_forward_pre_hook for transformer/embeding/norm/xxx block
|
||||||
|
self._register_sync_parameters_hook()
|
||||||
|
|
||||||
|
def _register_sync_parameters_hook(self) -> None:
|
||||||
|
def _pre_forward_hook(model: nn.Module, inputs: Any): # pylint: disable=W0613
|
||||||
|
bcast_handles = []
|
||||||
|
# gather all required broadcast hanles into a list
|
||||||
|
for rank in self._block_to_rank[model]:
|
||||||
|
bcast_handles.extend(self._bcast_handles[rank])
|
||||||
|
# need to clear _bcast_handles since they would be processed later
|
||||||
|
self._bcast_handles[rank] = []
|
||||||
|
# wait all required broadcast handles to be completed
|
||||||
|
for handle in bcast_handles:
|
||||||
|
handle.wait()
|
||||||
|
|
||||||
|
# register_forward_pre_hook for transformer/embeding/norm/xxx block
|
||||||
|
for block, _ in self._block_to_rank.items():
|
||||||
|
block.register_forward_pre_hook(partial(_pre_forward_hook))
|
||||||
|
|
||||||
|
def get_rank_by_param(self, param) -> int:
|
||||||
|
return self._param_to_rank[param]
|
||||||
|
|
||||||
|
def add_bcast_handle(self, rank, handle) -> None:
|
||||||
|
self._bcast_handles[rank].append(handle)
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
from .training_internlm import (
|
||||||
|
get_train_data_loader,
|
||||||
|
get_validation_data_loader,
|
||||||
|
initialize_llm_profile,
|
||||||
|
initialize_model,
|
||||||
|
initialize_optimizer,
|
||||||
|
load_new_batch,
|
||||||
|
record_current_batch_training_metrics,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_train_data_loader",
|
||||||
|
"get_validation_data_loader",
|
||||||
|
"initialize_llm_profile",
|
||||||
|
"initialize_model",
|
||||||
|
"initialize_optimizer",
|
||||||
|
"load_new_batch",
|
||||||
|
"record_current_batch_training_metrics",
|
||||||
|
]
|
|
@ -0,0 +1,422 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
|
import time
|
||||||
|
from functools import partial
|
||||||
|
from typing import Callable, Iterable, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from torch import nn
|
||||||
|
from torch.utils.data import ConcatDataset, DataLoader
|
||||||
|
|
||||||
|
from internlm.core.context import ParallelMode
|
||||||
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
|
from internlm.core.trainer import TrainState
|
||||||
|
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
|
||||||
|
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
|
||||||
|
from internlm.data.dataset import get_dataset_dict
|
||||||
|
from internlm.data.dummy_dataset import RandomDataset
|
||||||
|
from internlm.data.packed_dataset import (
|
||||||
|
PackedDataset,
|
||||||
|
PackedDatasetWithoutCuSeqlen,
|
||||||
|
get_packed_dataset_without_short_length,
|
||||||
|
)
|
||||||
|
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
|
||||||
|
from internlm.monitor import set_env_var
|
||||||
|
from internlm.monitor.monitor import monitor_manager as mm
|
||||||
|
from internlm.solver.beta2_scheduler import Beta2Scheduler
|
||||||
|
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
|
||||||
|
from internlm.solver.optimizer import HybridZeroOptimizer
|
||||||
|
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
|
||||||
|
from internlm.utils.common import DummyProfile
|
||||||
|
from internlm.utils.logger import get_logger
|
||||||
|
from internlm.utils.megatron_timers import megatron_timer as timer
|
||||||
|
from internlm.utils.parallel import (
|
||||||
|
is_no_pp_or_last_stage,
|
||||||
|
sync_model_param,
|
||||||
|
sync_model_param_within_tp,
|
||||||
|
)
|
||||||
|
from internlm.utils.registry import MODEL_INITIALIZER
|
||||||
|
|
||||||
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_model():
|
||||||
|
"""
|
||||||
|
Initialize model.
|
||||||
|
|
||||||
|
Returns: The neural network model to be trained or evaluated.
|
||||||
|
"""
|
||||||
|
|
||||||
|
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
|
||||||
|
if isinstance(model, nn.ModuleList):
|
||||||
|
model = nn.ModuleList(
|
||||||
|
[
|
||||||
|
NaiveAMPModel(
|
||||||
|
model=_m,
|
||||||
|
output_to_fp32=False, # manually controlled by interleaved pipleline scheduler
|
||||||
|
dtype=gpc.config.model.get("dtype", torch.half),
|
||||||
|
sync_buffer=False,
|
||||||
|
)
|
||||||
|
for _m in model
|
||||||
|
]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model = NaiveAMPModel(
|
||||||
|
model=model,
|
||||||
|
output_to_fp32=is_no_pp_or_last_stage(),
|
||||||
|
dtype=gpc.config.model.get("dtype", torch.half),
|
||||||
|
sync_buffer=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# This sync is very important, cause the model weights kept in optimizer are copied
|
||||||
|
# from the origin parameters in the memory, so we should make sure the dp sync
|
||||||
|
# does not influence the model weights in optimizer be different with the origin parameters.
|
||||||
|
sync_model_param(model, parallel_mode=ParallelMode.DATA)
|
||||||
|
|
||||||
|
# This function is needed to make sure parameters that are not splitted by tensor parallelism are
|
||||||
|
# the same across tensor parallelism.
|
||||||
|
sync_model_param_within_tp(model)
|
||||||
|
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
|
||||||
|
"""
|
||||||
|
Initialize optimizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (torch.nn.Module): Your model instance to be trained or evaluated.
|
||||||
|
|
||||||
|
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
|
||||||
|
"""
|
||||||
|
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
|
||||||
|
param_bcast_sync_handler = ParamBcastSyncHandler(model)
|
||||||
|
else:
|
||||||
|
param_bcast_sync_handler = None
|
||||||
|
|
||||||
|
adam_cfg = gpc.config.adam
|
||||||
|
naive_optimizer = torch.optim.AdamW(
|
||||||
|
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
|
||||||
|
lr=adam_cfg.lr,
|
||||||
|
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
|
||||||
|
eps=adam_cfg.adam_eps,
|
||||||
|
)
|
||||||
|
|
||||||
|
optimizer = HybridZeroOptimizer(
|
||||||
|
naive_optimizer,
|
||||||
|
grad_scal_cfg=gpc.config.grad_scaler,
|
||||||
|
zero_cfg=gpc.config.hybrid_zero_optimizer,
|
||||||
|
param_bcast_sync_handler=param_bcast_sync_handler,
|
||||||
|
)
|
||||||
|
|
||||||
|
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
|
||||||
|
|
||||||
|
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
|
||||||
|
|
||||||
|
return optimizer, beta2_scheduler, lr_scheduler
|
||||||
|
|
||||||
|
|
||||||
|
def get_train_data_loader(
|
||||||
|
num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Generate and return the training data loader.
|
||||||
|
|
||||||
|
Returns: A tuple of (train_dl, dataset_types).
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Get the dataset types
|
||||||
|
dataset_types = None
|
||||||
|
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
|
||||||
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
|
# Get the sample weight dictionary
|
||||||
|
train_folder = data_cfg.train_folder
|
||||||
|
|
||||||
|
if not train_folder:
|
||||||
|
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
|
||||||
|
if data_cfg.pack_sample_into_one:
|
||||||
|
train_ds = PackedDatasetWithoutCuSeqlen(
|
||||||
|
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
train_ds = PackedDataset(
|
||||||
|
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if dataset_generate_func is not None:
|
||||||
|
train_ds = dataset_generate_func()
|
||||||
|
else:
|
||||||
|
train_ds = get_packed_dataset_without_short_length(
|
||||||
|
folder=data_cfg.train_folder,
|
||||||
|
packed_length=data_cfg.packed_length,
|
||||||
|
max_length_per_sample=data_cfg.seq_len,
|
||||||
|
show_progress=dist.get_rank() == 0,
|
||||||
|
min_length=data_cfg.min_length,
|
||||||
|
min_length_dict=data_cfg.get("min_length_dict", {}),
|
||||||
|
pack_into_one_sample=data_cfg.pack_sample_into_one,
|
||||||
|
)
|
||||||
|
|
||||||
|
if dataset_generate_func is None or not train_folder:
|
||||||
|
# partition already completed
|
||||||
|
assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen, ConcatDataset))
|
||||||
|
# Create the training dataset sampler
|
||||||
|
train_sampler = StaticBatchSampler(
|
||||||
|
train_ds.datasets if isinstance(train_ds, ConcatDataset) else [train_ds],
|
||||||
|
batch_size=data_cfg.micro_num,
|
||||||
|
rampup_batch_size=data_cfg.rampup_batch_size,
|
||||||
|
micro_bsz=data_cfg.micro_bsz,
|
||||||
|
seed=1024,
|
||||||
|
drop_last=True,
|
||||||
|
data_rank=gpc.get_local_rank(ParallelMode.DATA),
|
||||||
|
data_world_size=gpc.get_world_size(ParallelMode.DATA),
|
||||||
|
)
|
||||||
|
|
||||||
|
if dataset_generate_func is None or not train_folder:
|
||||||
|
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
|
||||||
|
|
||||||
|
# Create the training data loader
|
||||||
|
train_dl = DataLoader(
|
||||||
|
dataset=train_ds,
|
||||||
|
batch_sampler=train_sampler,
|
||||||
|
num_workers=num_worker,
|
||||||
|
pin_memory=True,
|
||||||
|
collate_fn=train_collate_fn,
|
||||||
|
persistent_workers=num_worker > 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
return train_dl, dataset_types
|
||||||
|
|
||||||
|
|
||||||
|
def get_validation_data_loader(
|
||||||
|
num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
|
||||||
|
):
|
||||||
|
"""Generate and return the validation data loader."""
|
||||||
|
|
||||||
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
|
if not data_cfg.valid_folder:
|
||||||
|
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
|
||||||
|
else:
|
||||||
|
if dataset_generate_func is not None:
|
||||||
|
assert val_collate_fn and dataloader_func is not None
|
||||||
|
val_ds = dataset_generate_func()
|
||||||
|
else:
|
||||||
|
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
|
||||||
|
|
||||||
|
if not isinstance(val_ds, dict):
|
||||||
|
val_ds = {"val": val_ds}
|
||||||
|
|
||||||
|
if val_collate_fn is None or not data_cfg.valid_folder:
|
||||||
|
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
|
||||||
|
|
||||||
|
val_dls = {}
|
||||||
|
for val_name, ds in val_ds.items():
|
||||||
|
if dataloader_func and data_cfg.valid_folder is not None:
|
||||||
|
val_dls[val_name] = dataloader_func(dataset=ds, collate_fn=val_collate_fn)
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"load validation dataset {val_name} with valid batch size {str(data_cfg.valid_micro_num)} and "
|
||||||
|
f"{ds.size} Byte samples."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
|
||||||
|
# otherwise too much data may be dropped
|
||||||
|
batch_size = min(
|
||||||
|
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
)
|
||||||
|
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
|
||||||
|
|
||||||
|
if batch_size == 0 and gpc.is_rank_for_log():
|
||||||
|
logger.info(f"skip validate {val_name}.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
val_dls[val_name] = get_dpsampler_dataloader(
|
||||||
|
ds,
|
||||||
|
shuffle=False,
|
||||||
|
num_workers=num_worker,
|
||||||
|
batch_size=batch_size,
|
||||||
|
collate_fn=val_collate_fn,
|
||||||
|
drop_last=True,
|
||||||
|
) # drop_last=True, otherwise it may cause problems in the last batch
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
|
||||||
|
f"samples {str(len(val_dls[val_name]))}."
|
||||||
|
)
|
||||||
|
|
||||||
|
return val_dls
|
||||||
|
|
||||||
|
|
||||||
|
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
|
||||||
|
"""
|
||||||
|
Load and return the new batch data based on training data loader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
train_dl (torch.utils.data.DataLoader): Dataloader for training.
|
||||||
|
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
|
||||||
|
train_state (TrainState): Current training state.
|
||||||
|
|
||||||
|
Returns: A batch data and the updated train_iter.
|
||||||
|
"""
|
||||||
|
|
||||||
|
timer("batch-gen").start()
|
||||||
|
try:
|
||||||
|
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
|
||||||
|
if hasattr(train_state, "batch_sampler_iter"):
|
||||||
|
next(train_state.batch_sampler_iter)
|
||||||
|
except StopIteration:
|
||||||
|
train_iter = iter(train_dl)
|
||||||
|
batch = next(train_iter)
|
||||||
|
train_state.num_consumed_samples_in_epoch = 0
|
||||||
|
if hasattr(train_state, "batch_sampler"):
|
||||||
|
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
|
||||||
|
next(train_state.batch_sampler_iter)
|
||||||
|
timer("batch-gen").stop()
|
||||||
|
|
||||||
|
if batch[0].get("type_ids", None) is not None:
|
||||||
|
# if use_flash_attn is False, we need to unpack type_ids
|
||||||
|
if not gpc.config.model.use_flash_attn:
|
||||||
|
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
|
||||||
|
|
||||||
|
return batch, train_iter
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
|
||||||
|
"""Initialize and return the profiler context manager instance."""
|
||||||
|
|
||||||
|
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
|
||||||
|
llm_profile = torch.profiler.profile
|
||||||
|
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
|
||||||
|
else:
|
||||||
|
llm_profile = DummyProfile
|
||||||
|
|
||||||
|
return llm_profile(
|
||||||
|
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
||||||
|
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
|
||||||
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
|
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
||||||
|
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||||
|
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
|
||||||
|
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
|
||||||
|
),
|
||||||
|
with_stack=True,
|
||||||
|
with_modules=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def record_current_batch_training_metrics(
|
||||||
|
get_tflops_func,
|
||||||
|
logger,
|
||||||
|
writer,
|
||||||
|
success_update,
|
||||||
|
batch_count,
|
||||||
|
batch,
|
||||||
|
train_state,
|
||||||
|
optimizer,
|
||||||
|
beta2_scheduler,
|
||||||
|
trainer,
|
||||||
|
start_time,
|
||||||
|
loss,
|
||||||
|
grad_norm,
|
||||||
|
metric,
|
||||||
|
update_panel,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Print some training metrics of the current batch.
|
||||||
|
"""
|
||||||
|
|
||||||
|
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
|
||||||
|
|
||||||
|
if success_update in (0, True):
|
||||||
|
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
if is_no_pp_or_last_stage():
|
||||||
|
acc_perplex = metric.get_metric()
|
||||||
|
|
||||||
|
if success_update and gpc.is_rank_for_log():
|
||||||
|
lr = optimizer.param_groups[0]["lr"]
|
||||||
|
if hasattr(trainer.engine.optimizer, "grad_scaler"):
|
||||||
|
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
|
||||||
|
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
|
||||||
|
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
|
||||||
|
|
||||||
|
num_tokens_in_batch = batch[1].nelement()
|
||||||
|
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
|
||||||
|
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
|
||||||
|
tk_per_gpu = 0
|
||||||
|
tk_per_gpu = round(
|
||||||
|
num_tokens_in_batch
|
||||||
|
* gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
/ gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
/ (time.time() - start_time),
|
||||||
|
2,
|
||||||
|
)
|
||||||
|
|
||||||
|
tflops = get_tflops_func((time.time() - start_time))
|
||||||
|
|
||||||
|
infos = {
|
||||||
|
"tflops": tflops,
|
||||||
|
"step": batch_count,
|
||||||
|
"loss": loss.item(),
|
||||||
|
"tgs (tokens/gpu/second)": tk_per_gpu,
|
||||||
|
"lr": lr,
|
||||||
|
"loss_scale": scaler,
|
||||||
|
"grad_norm": grad_norm,
|
||||||
|
}
|
||||||
|
|
||||||
|
infos["micro_num"] = len(batch[1])
|
||||||
|
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
|
||||||
|
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
|
||||||
|
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
|
||||||
|
infos["largest_length"] = max_length_in_batch # the longest input
|
||||||
|
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
|
||||||
|
infos["smallest_batch"] = min_samples_in_batch
|
||||||
|
infos["adam_beta2"] = beta2_scheduler.get_beta2()
|
||||||
|
|
||||||
|
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
|
||||||
|
infos["fwd_bwd_time"] = fwd_bwd_time
|
||||||
|
|
||||||
|
for key, value in acc_perplex.items():
|
||||||
|
infos[key] = value
|
||||||
|
|
||||||
|
line = ""
|
||||||
|
for key, value in infos.items():
|
||||||
|
line += f"{key}={value} "
|
||||||
|
if isinstance(value, dict):
|
||||||
|
writer.add_scalars(key=key, value=value, step=train_state.step_count)
|
||||||
|
else:
|
||||||
|
writer.add_scalar(key=key, value=value, step=train_state.step_count)
|
||||||
|
|
||||||
|
if update_panel:
|
||||||
|
# metrics shown with dashboard panels
|
||||||
|
panel_metrics = {
|
||||||
|
"step": batch_count,
|
||||||
|
"lr": lr,
|
||||||
|
"num_consumed_tokens": train_state.num_consumed_tokens,
|
||||||
|
"loss": loss.item(),
|
||||||
|
"flops": tflops,
|
||||||
|
"tgs": tk_per_gpu,
|
||||||
|
"acc": acc_perplex["acc"],
|
||||||
|
"perplexity": acc_perplex["perplexity"],
|
||||||
|
"fwd_bwd_time": fwd_bwd_time,
|
||||||
|
}
|
||||||
|
for norm_key, norm_value in grad_norm.items():
|
||||||
|
panel_metrics[norm_key] = norm_value
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"{line}",
|
||||||
|
line=line,
|
||||||
|
extra=panel_metrics,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(line)
|
||||||
|
|
||||||
|
# if loss spike occurs, send alert info to feishu
|
||||||
|
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
|
|
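The tgs figure logged above is the per-GPU token throughput: tokens in the local batch, scaled up to the whole data-parallel group, divided by the total number of ranks and the step time. A worked example with assumed numbers:

```python
# All numbers below are assumptions for illustration only.
num_tokens_in_batch = 4 * 2048  # e.g. 4 micro batches of 2048 tokens on this dp rank
dp_world_size = 8               # data-parallel group size
global_world_size = 32          # total ranks (dp * tp * pp)
elapsed = 1.6                   # seconds since start_time for this step

tk_per_gpu = round(num_tokens_in_batch * dp_world_size / global_world_size / elapsed, 2)
print(tk_per_gpu)               # 8192 * 8 / 32 / 1.6 = 1280.0 tokens/gpu/second
```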
@ -52,12 +52,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def switch_sequence_parallel_mode():
|
def switch_sequence_parallel_mode():
|
||||||
prev_mode = gpc.config.model.sequence_parallel
|
prev_mode = gpc.config.parallel.sequence_parallel
|
||||||
try:
|
try:
|
||||||
gpc.config.model.sequence_parallel = False
|
gpc.config.parallel.sequence_parallel = False
|
||||||
yield
|
yield
|
||||||
finally:
|
finally:
|
||||||
gpc.config.model.sequence_parallel = prev_mode
|
gpc.config.parallel.sequence_parallel = prev_mode
|
||||||
|
|
||||||
|
|
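switch_sequence_parallel_mode is the usual save/override/restore context-manager pattern; the try/finally guarantees the flag is restored even if evaluation raises. A generic, self-contained version of the same pattern (the dict stands in for gpc.config.parallel):

```python
from contextlib import contextmanager

# Stash the current flag, override it for the duration of the block, and restore
# it in `finally` so an exception inside the block cannot leak the temporary value.
config = {"sequence_parallel": True}  # stand-in for gpc.config.parallel

@contextmanager
def override_flag(cfg, key, value):
    prev = cfg[key]
    try:
        cfg[key] = value
        yield
    finally:
        cfg[key] = prev

with override_flag(config, "sequence_parallel", False):
    assert config["sequence_parallel"] is False  # evaluation would run here
assert config["sequence_parallel"] is True       # restored afterwards
```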
||||||
def evaluate_on_val_dls(
|
def evaluate_on_val_dls(
|
||||||
|
@ -67,6 +67,7 @@ def evaluate_on_val_dls(
|
||||||
logger,
|
logger,
|
||||||
step_count,
|
step_count,
|
||||||
update_panel: bool = False,
|
update_panel: bool = False,
|
||||||
|
streaming: bool = False,
|
||||||
):
|
):
|
||||||
with switch_sequence_parallel_mode():
|
with switch_sequence_parallel_mode():
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
@ -75,7 +76,7 @@ def evaluate_on_val_dls(
|
||||||
data_cfg = gpc.config.data
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
for val_name, val_dl in val_dls.items():
|
for val_name, val_dl in val_dls.items():
|
||||||
if len(val_dl) == 0 and verbose:
|
if not streaming and len(val_dl) == 0 and verbose:
|
||||||
logger.info(f"Validation dataset: {val_name} is empty")
|
logger.info(f"Validation dataset: {val_name} is empty")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -91,7 +92,7 @@ def evaluate_on_val_dls(
|
||||||
for val_idx, batch in tqdm(
|
for val_idx, batch in tqdm(
|
||||||
enumerate(val_dl),
|
enumerate(val_dl),
|
||||||
desc="Val.",
|
desc="Val.",
|
||||||
total=len(val_dl),
|
total=len(val_dl) if not streaming else None,
|
||||||
position=1,
|
position=1,
|
||||||
disable=not verbose,
|
disable=not verbose,
|
||||||
leave=False,
|
leave=False,
|
||||||
|
@ -135,7 +136,7 @@ def evaluate_on_val_dls(
|
||||||
dist.barrier()
|
dist.barrier()
|
||||||
|
|
||||||
val_res = val_metric.get_metric()
|
val_res = val_metric.get_metric()
|
||||||
if verbose and len(val_dl) != 0:
|
if verbose and (streaming or len(val_dl) != 0):
|
||||||
val_loss = val_loss / (val_idx + 1 + 1e-6)
|
val_loss = val_loss / (val_idx + 1 + 1e-6)
|
||||||
infos = {
|
infos = {
|
||||||
"step": step_count,
|
"step": step_count,
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
|
import math
|
||||||
|
import socket
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
|
||||||
|
from torch.utils import benchmark
|
||||||
|
|
||||||
|
from internlm.utils.logger import get_logger
|
||||||
|
|
||||||
|
try:
|
||||||
|
import GPUtil
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
GPUtil, psutil = None, None
|
||||||
|
|
||||||
|
from internlm.core.context import ParallelMode
|
||||||
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.utils.common import get_current_device
|
||||||
|
|
||||||
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_forward(
|
||||||
|
test_fn,
|
||||||
|
*inputs,
|
||||||
|
repeats=100,
|
||||||
|
amp=True,
|
||||||
|
amp_dtype=torch.float16,
|
||||||
|
**kwinputs,
|
||||||
|
):
|
||||||
|
"""Use Pytorch Benchmark on the forward pass of an arbitrary function."""
|
||||||
|
|
||||||
|
def amp_wrapper(*inputs, **kwinputs):
|
||||||
|
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
|
||||||
|
test_fn(*inputs, **kwinputs)
|
||||||
|
|
||||||
|
bench_timer = benchmark.Timer(
|
||||||
|
stmt="test_fn_amp(*inputs, **kwinputs)",
|
||||||
|
globals={"test_fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
|
||||||
|
num_threads=torch.get_num_threads(),
|
||||||
|
)
|
||||||
|
used_time = bench_timer.timeit(repeats)
|
||||||
|
return used_time.mean
|
||||||
|
|
||||||
|
|
||||||
|
def flops(batch, seqlen, headdim, nheads, time_f):
|
||||||
|
"""Compute the flops value of a GPU with give flashattention function"""
|
||||||
|
|
||||||
|
flop = 4 * batch * seqlen**2 * nheads * headdim
|
||||||
|
return (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
|
||||||
|
|
||||||
|
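For reference, plugging the shapes bench_gpu uses below into this formula, together with an assumed forward time (the real value depends on the GPU), gives the kind of number the benchmark reports:

```python
import math

# Assumed shapes (matching bench_gpu) and a hypothetical 0.5 ms forward pass.
batch, seqlen, headdim = 2, 1024, 64
nheads = 2048 // headdim        # dim = 2048 -> 32 heads
time_f = 5e-4                   # hypothetical timing in seconds

flop = 4 * batch * seqlen**2 * nheads * headdim  # 17,179,869,184 FLOPs
tflops = (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
print(f"{tflops:.2f} TFLOPS")   # about 34.36 under these assumptions
```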
|
||||||
|
def get_gpu_temperature():
|
||||||
|
"""Get current GPU temperature."""
|
||||||
|
try:
|
||||||
|
gpu_id = torch.cuda.current_device()
|
||||||
|
except AssertionError:
|
||||||
|
gpu_id = -1
|
||||||
|
|
||||||
|
if GPUtil is not None and gpu_id >= 0:
|
||||||
|
gpus = GPUtil.getGPUs()
|
||||||
|
gpu_temperature = gpus[gpu_id].temperature
|
||||||
|
else:
|
||||||
|
gpu_temperature = -1
|
||||||
|
|
||||||
|
return gpu_temperature
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpu_temperature():
|
||||||
|
"""Get current CPU temperature."""
|
||||||
|
|
||||||
|
if psutil is not None:
|
||||||
|
cpu_temperature = psutil.sensors_temperatures()["coretemp"][0].current
|
||||||
|
else:
|
||||||
|
cpu_temperature = -1
|
||||||
|
|
||||||
|
return cpu_temperature
|
||||||
|
|
||||||
|
|
||||||
|
def bench_net():
|
||||||
|
"""Benchmark nccl performance for slow node detection."""
|
||||||
|
|
||||||
|
if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
|
||||||
|
return
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info("benchmarking network speed ...")
|
||||||
|
|
||||||
|
repeats = 100
|
||||||
|
input_data = torch.randn(
|
||||||
|
8 * 1024 * 1024,
|
||||||
|
device=get_current_device(),
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
def allreduce_fn(inputs):
|
||||||
|
dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
|
||||||
|
|
||||||
|
bench_timer = benchmark.Timer(
|
||||||
|
stmt="test_fn_amp(inputs)",
|
||||||
|
globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
|
||||||
|
num_threads=torch.get_num_threads(),
|
||||||
|
)
|
||||||
|
allreduce_time = bench_timer.timeit(repeats).mean
|
||||||
|
allreduce_time = allreduce_time * 10**3
|
||||||
|
allreduce_time_this = allreduce_time
|
||||||
|
allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
|
||||||
|
dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
|
||||||
|
allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
allreduce_time_avg = float(allreduce_time_avg.item())
|
||||||
|
|
||||||
|
if allreduce_time_this >= allreduce_time_avg * 1.05:
|
||||||
|
logger.warning(
|
||||||
|
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
|
||||||
|
f"Hostname {socket.gethostname()}, "
|
||||||
|
f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
|
||||||
|
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_gpu(use_flash_attn=True):
|
||||||
|
"""Benchmark single GPU performance for slow node detection."""
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info("benchmarking gpu speed ...")
|
||||||
|
|
||||||
|
headdim = 64
|
||||||
|
dim = 2048
|
||||||
|
batch_size, seqlen = 2, 1024
|
||||||
|
nheads = dim // headdim
|
||||||
|
|
||||||
|
inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
|
||||||
|
inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
|
||||||
|
|
||||||
|
qkv = torch.randn(
|
||||||
|
batch_size,
|
||||||
|
seqlen,
|
||||||
|
3,
|
||||||
|
dim // headdim,
|
||||||
|
headdim,
|
||||||
|
device=get_current_device(),
|
||||||
|
dtype=torch.float16,
|
||||||
|
requires_grad=True,
|
||||||
|
)
|
||||||
|
time_f = benchmark_forward(inner_attn, qkv)
|
||||||
|
speed = flops(batch_size, seqlen, headdim, nheads, time_f)
|
||||||
|
speed_this = speed
|
||||||
|
speed = torch.Tensor([speed]).to(device=get_current_device())
|
||||||
|
dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
|
||||||
|
speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
speed_avg = float(speed_avg.item())
|
||||||
|
|
||||||
|
if speed_this <= speed_avg * 0.95:
|
||||||
|
logger.warning(
|
||||||
|
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
|
||||||
|
f"Hostname {socket.gethostname()}, "
|
||||||
|
f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
|
||||||
|
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
|
||||||
|
)
|
|
@ -14,18 +14,19 @@ class _Timer:
|
||||||
self.elapsed_ = 0.0
|
self.elapsed_ = 0.0
|
||||||
self.started_ = False
|
self.started_ = False
|
||||||
self.start_time = time.time()
|
self.start_time = time.time()
|
||||||
|
self.stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
"""Start the timer."""
|
"""Start the timer."""
|
||||||
assert not self.started_, "timer has already been started"
|
assert not self.started_, "timer has already been started"
|
||||||
torch.cuda.synchronize()
|
self.stream.synchronize()
|
||||||
self.start_time = time.time()
|
self.start_time = time.time()
|
||||||
self.started_ = True
|
self.started_ = True
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
"""Stop the timer."""
|
"""Stop the timer."""
|
||||||
assert self.started_, "timer is not started"
|
assert self.started_, "timer is not started"
|
||||||
torch.cuda.synchronize()
|
self.stream.synchronize()
|
||||||
self.elapsed_ += time.time() - self.start_time
|
self.elapsed_ += time.time() - self.start_time
|
||||||
self.started_ = False
|
self.started_ = False
|
||||||
|
|
||||||
|
|
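The change above replaces the device-wide torch.cuda.synchronize() with a synchronize on the stream captured when the timer was constructed, so timing one stream no longer stalls the others. A condensed sketch of the resulting timer; the CPU-only fallback is an addition here for self-containment, not part of the original:

```python
import time

import torch

class StreamTimer:
    """Condensed sketch of the per-stream timer after the change above."""

    def __init__(self):
        self.elapsed_ = 0.0
        self.started_ = False
        self.start_time = time.time()
        # Capture the current CUDA stream once; only this stream is synchronized.
        self.stream = torch.cuda.current_stream() if torch.cuda.is_available() else None

    def _sync(self):
        if self.stream is not None:
            self.stream.synchronize()

    def start(self):
        assert not self.started_, "timer has already been started"
        self._sync()
        self.start_time = time.time()
        self.started_ = True

    def stop(self):
        assert self.started_, "timer is not started"
        self._sync()
        self.elapsed_ += time.time() - self.start_time
        self.started_ = False
```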
|
@ -2,7 +2,9 @@
|
||||||
# -*- encoding: utf-8 -*-
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import fcntl
|
||||||
import os
|
import os
|
||||||
|
import socket
|
||||||
import time
|
import time
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
@ -12,6 +14,7 @@ import torch
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.context import ParallelMode
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
from internlm.core.trainer import TrainState
|
from internlm.core.trainer import TrainState
|
||||||
|
from internlm.monitor import send_alert_message
|
||||||
from internlm.solver.optimizer import HybridZeroOptimizer
|
from internlm.solver.optimizer import HybridZeroOptimizer
|
||||||
from internlm.utils.common import get_current_device
|
from internlm.utils.common import get_current_device
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
|
@ -25,8 +28,6 @@ from internlm.utils.storage_manager import (
|
||||||
|
|
||||||
logger = get_logger(__file__)
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
quit_signal_handler = None
|
|
||||||
|
|
||||||
|
|
||||||
class CheckpointType(Enum):
|
class CheckpointType(Enum):
|
||||||
NORMAL_CHECKPOINT = 1
|
NORMAL_CHECKPOINT = 1
|
||||||
|
@ -167,44 +168,6 @@ def save_optimizer_checkpoint(optim, state_path):
|
||||||
llm_save(os.path.join(state_path, fp), states)
|
llm_save(os.path.join(state_path, fp), states)
|
||||||
|
|
||||||
|
|
||||||
def save_checkpoint(folder, model, optimizer, scheduler, train_state: TrainState, model_config: Dict = None):
|
|
||||||
"""
|
|
||||||
Save checkpoint to the given folder path.
|
|
||||||
"""
|
|
||||||
|
|
||||||
start = time.time()
|
|
||||||
torch.distributed.barrier()
|
|
||||||
folder = os.path.join(folder, str(train_state.step_count))
|
|
||||||
logger.info(
|
|
||||||
f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count} from rank:{gpc.get_global_rank()}..."
|
|
||||||
)
|
|
||||||
|
|
||||||
timer("save-model").start()
|
|
||||||
save_model_checkpoint(folder=folder, model=model)
|
|
||||||
timer("save-model").stop()
|
|
||||||
|
|
||||||
timer("save-optimizer").start()
|
|
||||||
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
|
|
||||||
timer("save-optimizer").stop()
|
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
scheduler_states = scheduler.state_dict()
|
|
||||||
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
|
|
||||||
|
|
||||||
sampler_state = train_state.batch_sampler.state_dict()
|
|
||||||
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
|
|
||||||
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
|
|
||||||
|
|
||||||
if model_config is not None:
|
|
||||||
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
|
|
||||||
|
|
||||||
torch.distributed.barrier()
|
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
timer.log(["save-model", "save-optimizer"], logger=logger)
|
|
||||||
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
|
|
||||||
|
|
||||||
|
|
||||||
def load_optimizer_checkpoint(folder, optim):
|
def load_optimizer_checkpoint(folder, optim):
|
||||||
"""Load the optimizer state from the local file system or remote
|
"""Load the optimizer state from the local file system or remote
|
||||||
object storage Service (OSS).
|
object storage Service (OSS).
|
||||||
|
@ -304,19 +267,12 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train
|
||||||
logger.info(f"reload load_scheduler:{lr_scheduler}")
|
logger.info(f"reload load_scheduler:{lr_scheduler}")
|
||||||
|
|
||||||
|
|
||||||
class CheckpointSaveManager:
|
class CheckpointManager:
|
||||||
"""StorageManagerContext"""
|
"""StorageManagerContext"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, ckpt_config, model, model_config=None, model_config_file=None, feishu_address=None) -> None:
|
||||||
self,
|
|
||||||
ckpt_config,
|
|
||||||
model,
|
|
||||||
optimizer,
|
|
||||||
lr_scheduler,
|
|
||||||
model_config,
|
|
||||||
) -> None:
|
|
||||||
"""
|
"""
|
||||||
CheckpointSaveManager is used to decide when to store ckpt. If it is an asynchronous
|
CheckpointManager is used to decide when to store ckpt. If it is an asynchronous
|
||||||
upload mode, you must call wait_async_upload_finish at the end of the program to wait
|
upload mode, you must call wait_async_upload_finish at the end of the program to wait
|
||||||
for the asynchronous ckpt upload to complete.
|
for the asynchronous ckpt upload to complete.
|
||||||
|
|
||||||
|
@ -332,26 +288,96 @@ class CheckpointSaveManager:
|
||||||
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
|
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
|
||||||
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
|
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
|
||||||
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
|
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
|
||||||
|
self.stop_file_path = ckpt_config.stop_file_path
|
||||||
|
self.load_model_only_folder = ckpt_config.load_model_only_folder
|
||||||
|
self.feishu_address = feishu_address
|
||||||
self.storage_manager = get_storage_manager()
|
self.storage_manager = get_storage_manager()
|
||||||
self.snapshot_counter = 0
|
self.snapshot_counter = 0
|
||||||
|
self.load_optimizer = gpc.config.ckpt.load_optimizer
|
||||||
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.optimizer = optimizer
|
|
||||||
self.lr_scheduler = lr_scheduler
|
|
||||||
self.model_config = model_config
|
self.model_config = model_config
|
||||||
|
self.model_config_file = model_config_file
|
||||||
|
|
||||||
|
if self.stop_file_path and gpc.get_global_rank() == 0:
|
||||||
|
dir_path = os.path.dirname(self.stop_file_path)
|
||||||
|
if dir_path != "" and not os.path.exists(dir_path):
|
||||||
|
os.makedirs(dir_path)
|
||||||
|
with open(self.stop_file_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write("0")
|
||||||
|
|
||||||
|
if ckpt_config.load_given_ckpt is False:
|
||||||
|
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
|
||||||
|
latest_ckpt_path = self.query_lastest_ckpt()
|
||||||
|
if latest_ckpt_path:
|
||||||
|
self.load_ckpt_folder = latest_ckpt_path
|
||||||
|
else:
|
||||||
|
# At this time, we have to load model init weights and train from step 0.
|
||||||
|
self.load_ckpt_folder = self.load_model_only_folder
|
||||||
|
else:
|
||||||
|
self.load_ckpt_folder = ckpt_config.load_ckpt_folder
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"load_ckpt_folder will set to :'{self.load_ckpt_folder}'")
|
||||||
|
if self.stop_file_path is None:
|
||||||
|
logger.warning("no set stop_file_path, quit_signal_handler is disable")
|
||||||
|
|
||||||
|
def quit_signal_handler(self, train_state) -> bool:
|
||||||
|
"""
|
||||||
|
Exit signal detection function: if the exit step is written to the file at 'stop_file_path',
|
||||||
|
all ranks will save ckpt and exit.
|
||||||
|
Negative integer step means save ckpt.
|
||||||
|
Positive integer step means save ckpt and quit.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
train_state (TrainState): current training state.
|
||||||
|
Returns:
|
||||||
|
(now_break, now_save_ckpt, save_type): whether to quit, whether to save a ckpt now, and the checkpoint type.
|
||||||
|
"""
|
||||||
|
now_break, now_save_ckpt, save_type = False, False, CheckpointType.NORMAL_CHECKPOINT
|
||||||
|
|
||||||
|
if self.stop_file_path is None:
|
||||||
|
return now_break, now_save_ckpt, save_type
|
||||||
|
|
||||||
|
with open(self.stop_file_path, "a+", encoding="utf-8") as f:
|
||||||
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
|
f.seek(0)
|
||||||
|
msg = f.read()
|
||||||
|
fcntl.flock(f, fcntl.LOCK_UN)
|
||||||
|
action_step = int(msg)
|
||||||
|
|
||||||
|
if action_step < 0 and abs(action_step) == train_state.step_count:
|
||||||
|
now_save_ckpt = True
|
||||||
|
|
||||||
|
if action_step > 0 and action_step == train_state.step_count:
|
||||||
|
now_break, now_save_ckpt = True, True
|
||||||
|
|
||||||
|
if action_step != 0 and gpc.is_rank_for_log():
|
||||||
|
msg = "Stop" if action_step > 0 else "Save"
|
||||||
|
action_step = abs(action_step)
|
||||||
|
if train_state.step_count <= action_step:
|
||||||
|
if self.feishu_address:
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"training will {msg} at step_count {action_step}!\
|
||||||
|
now step_count is {train_state.step_count}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return now_break, now_save_ckpt, save_type
|
||||||
|
|
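The stop file is driven from outside the job: a watchdog or an operator writes a single integer into the file at stop_file_path, and every rank reads it under an exclusive lock as shown above (negative value: save at that step; positive value: save and quit at that step). A hedged sketch of the writer side; the path is a placeholder and the helper name is made up for illustration:

```python
import fcntl

STOP_FILE = "/path/to/stop_file"  # placeholder for ckpt_config.stop_file_path

def request_action(step: int, quit_after_save: bool) -> None:
    """Write the control integer: -step requests a save at `step`, +step requests save-and-quit."""
    value = step if quit_after_save else -step
    with open(STOP_FILE, "w", encoding="utf-8") as f:
        fcntl.flock(f, fcntl.LOCK_EX)  # POSIX-only, mirroring the reader's exclusive lock
        f.write(str(value))
        fcntl.flock(f, fcntl.LOCK_UN)

# e.g. ask every rank to checkpoint at step 1000 without stopping:
# request_action(1000, quit_after_save=False)
```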
||||||
def try_save_checkpoint(self, train_state):
|
def try_save_checkpoint(self, train_state):
|
||||||
if not self.enable_save_ckpt:
|
if not self.enable_save_ckpt:
|
||||||
return
|
return False
|
||||||
|
|
||||||
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
|
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
|
||||||
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
|
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
|
||||||
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
|
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
|
||||||
if train_state.step_count % self.checkpoint_every == 0:
|
if train_state.step_count % self.checkpoint_every == 0:
|
||||||
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
|
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
|
||||||
|
now_break, signal_save_ckpts, signal_save_type = self.quit_signal_handler(train_state)
|
||||||
if save_ckpts is False:
|
if save_ckpts is False:
|
||||||
if quit_signal_handler is not None:
|
save_ckpts = signal_save_ckpts
|
||||||
save_ckpts, save_type = quit_signal_handler(train_state)
|
save_type = signal_save_type
|
||||||
|
|
||||||
if save_ckpts:
|
if save_ckpts:
|
||||||
# Wait for the previous round of asynchronous upload storage to complete.
|
# Wait for the previous round of asynchronous upload storage to complete.
|
||||||
|
@ -361,18 +387,247 @@ class CheckpointSaveManager:
|
||||||
self.snapshot_counter = (self.snapshot_counter + 1) % 2
|
self.snapshot_counter = (self.snapshot_counter + 1) % 2
|
||||||
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
|
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
|
||||||
else:
|
else:
|
||||||
save_ckpt_folder = self.save_ckpt_folder
|
save_ckpt_folder = os.path.join(self.save_ckpt_folder, str(train_state.step_count))
|
||||||
|
|
||||||
save_checkpoint(
|
self.save_checkpoint(
|
||||||
folder=save_ckpt_folder,
|
folder=save_ckpt_folder,
|
||||||
model=self.model,
|
model=self.model,
|
||||||
optimizer=self.optimizer,
|
optimizer=self.optimizer,
|
||||||
scheduler=self.lr_scheduler,
|
scheduler=self.lr_scheduler,
|
||||||
train_state=train_state,
|
train_state=train_state,
|
||||||
model_config=self.model_config,
|
model_config=self.model_config,
|
||||||
|
model_config_file=self.model_config_file,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return now_break
|
||||||
|
|
||||||
def wait_async_upload_finish(self):
|
def wait_async_upload_finish(self):
|
||||||
"""wait for all checkpoint uploads to be completed"""
|
"""wait for all checkpoint uploads to be completed"""
|
||||||
self.storage_manager.wait()
|
self.storage_manager.wait()
|
||||||
torch.distributed.barrier()
|
torch.distributed.barrier()
|
||||||
|
|
||||||
|
def query_latest_snapshot_step_boto3(self):
|
||||||
|
"""query_latest_snapshot_step_boto3
|
||||||
|
Returns:
|
||||||
|
Tuple[str, int]: path of the latest ckpt and its step; if not found, (None, None) is returned.
|
||||||
|
"""
|
||||||
|
ckpt_list = self.storage_manager.get_fns(self.save_ckpt_folder)
|
||||||
|
if len(ckpt_list) == 0:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
max_normal_step = 0
|
||||||
|
ckpt_list = list(map(lambda a: int(a.strip("/")) if a.strip("/").isdigit() else 0, ckpt_list))
|
||||||
|
ckpt_list.sort(reverse=True)
|
||||||
|
for ckpt in ckpt_list:
|
||||||
|
fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
|
||||||
|
for fn in fns_list:
|
||||||
|
if fn.endswith(".step"):
|
||||||
|
max_normal_step = ckpt
|
||||||
|
break
|
||||||
|
if max_normal_step != 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
max_normal_step = ckpt_list[0]
|
||||||
|
load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))
|
||||||
|
|
||||||
|
snapshot_path_0 = os.path.join(self.save_ckpt_folder, "snapshot", "0")
|
||||||
|
snapshot_path_1 = os.path.join(self.save_ckpt_folder, "snapshot", "1")
|
||||||
|
ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_0)
|
||||||
|
ckpt_list_2 = self.storage_manager.get_fns(snapshot_path_1)
|
||||||
|
max_step_0, max_step_1 = 0, 0
|
||||||
|
for ckpt in ckpt_list_1:
|
||||||
|
ckpt = ckpt.strip("/")
|
||||||
|
if ckpt.endswith(".step"):
|
||||||
|
max_step_0 = max(max_step_0, int(ckpt.split(".")[0]))
|
||||||
|
for ckpt in ckpt_list_2:
|
||||||
|
ckpt = ckpt.strip("/")
|
||||||
|
if ckpt.endswith(".step"):
|
||||||
|
max_step_1 = max(max_step_1, int(ckpt.split(".")[0]))
|
||||||
|
|
||||||
|
snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
|
||||||
|
snap_step = max(max_step_0, max_step_1)
|
||||||
|
load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
|
||||||
|
load_step = max(snap_step, max_normal_step)
|
||||||
|
return load_path, load_step
|
||||||
|
|
||||||
|
def query_latest_snapshot_step_local(self):
|
||||||
|
max_step, max_step_path = 0, None
|
||||||
|
for root, _, files in os.walk(self.save_ckpt_folder, followlinks=True):
|
||||||
|
for fn in files:
|
||||||
|
fn = fn.strip("/")
|
||||||
|
if fn.endswith(".step"):
|
||||||
|
# We assume that both normal ckpt and snapshot ckpt will store the '.step' file
|
||||||
|
# as an integrity flag.
|
||||||
|
step = int(fn.rsplit(".", maxsplit=1)[0])
|
||||||
|
if max_step < step:
|
||||||
|
max_step = step
|
||||||
|
max_step_path = root
|
||||||
|
|
||||||
|
return max_step_path, max_step
|
||||||
|
|
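The local query above relies on every complete checkpoint directory containing an '<step>.step' marker file; the newest marker wins. A self-contained illustration with a hypothetical on-disk layout:

```python
import os
import tempfile

# Build a throwaway layout: a normal ckpt at step 40 and two snapshot slots.
root = tempfile.mkdtemp()
for sub, step in [("40", 40), ("snapshot/0", 60), ("snapshot/1", 50)]:
    path = os.path.join(root, sub)
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f"{step}.step"), "w", encoding="utf-8"):
        pass

# Same scan as query_latest_snapshot_step_local: walk, find '.step' files, keep the max.
max_step, max_step_path = 0, None
for walk_root, _, files in os.walk(root, followlinks=True):
    for fn in files:
        if fn.endswith(".step"):
            step = int(fn.rsplit(".", maxsplit=1)[0])
            if max_step < step:
                max_step, max_step_path = step, walk_root

print(max_step_path, max_step)  # .../snapshot/0 60
```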
||||||
|
def query_lastest_ckpt(self):
|
||||||
|
latest_checkpoint = None
|
||||||
|
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
|
||||||
|
if self.save_ckpt_folder:
|
||||||
|
if self.save_ckpt_folder.startswith("boto3"):
|
||||||
|
latest_checkpoint, step = self.query_latest_snapshot_step_boto3()
|
||||||
|
elif self.save_ckpt_folder.startswith("local"):
|
||||||
|
latest_checkpoint, step = self.query_latest_snapshot_step_local()
|
||||||
|
else:
|
||||||
|
latest_checkpoint, step = None, 0
|
||||||
|
|
||||||
|
if latest_checkpoint is not None:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"Found latest ckpt : {latest_checkpoint}, step: {step}")
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"Auto restart resume from ckpt-path: '{latest_checkpoint}', step : {step}",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"Can't find snapshot checkpoint, use default load-ckpt path: {latest_checkpoint}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return latest_checkpoint
|
||||||
|
|
||||||
|
def try_load_model(self, current_time=""):
|
||||||
|
model_load_path = None
|
||||||
|
|
||||||
|
if self.load_ckpt_folder and self.load_model_only_folder:
|
||||||
|
raise ValueError(
|
||||||
|
"Error, try to use both load_ckpt_folder and load_model_only_folder paths, \
|
||||||
|
if you only need to load model weights (for example starting an SFT task for the first time), \
|
||||||
|
set load_model_only_folder path, if you need to resume training from ckpt, \
|
||||||
|
set load_ckpt_folder or use default value \
|
||||||
|
(if left at the default value, internlm will try to load the latest ckpt from save_ckpt_folder)"
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.load_ckpt_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========Resume training from `{self.load_ckpt_folder}` {current_time} on host:"
|
||||||
|
f"{socket.gethostname()}==========="
|
||||||
|
)
|
||||||
|
model_load_path = self.load_ckpt_folder
|
||||||
|
elif self.load_model_only_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========Load Model from `{self.load_model_only_folder}` {current_time} on host:"
|
||||||
|
f"{socket.gethostname()}==========="
|
||||||
|
)
|
||||||
|
model_load_path = self.load_model_only_folder
|
||||||
|
else:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
|
||||||
|
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
|
||||||
|
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
|
||||||
|
)
|
||||||
|
|
||||||
|
# Loading model weights must be done before zero is initialized.
|
||||||
|
if model_load_path is not None:
|
||||||
|
load_model_checkpoint(folder=model_load_path, model=self.model)
|
||||||
|
|
||||||
|
def try_resume_training(self, lr_scheduler, optimizer, lr, train_state, train_dl):
|
||||||
|
"""Attempt to restore the training state of the last ckpt.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lr_scheduler (_LRScheduler): lr_scheduler object.
|
||||||
|
optimizer (Optimizer): optimizer object.
|
||||||
|
lr (float): learning rate.
|
||||||
|
train_state (TrainState): training states.
|
||||||
|
train_dl (DataLoader): training dataloader object.
|
||||||
|
"""
|
||||||
|
if self.load_ckpt_folder is not None:
|
||||||
|
# load optimizer states.
|
||||||
|
if self.load_optimizer:
|
||||||
|
load_optimizer_checkpoint(self.load_ckpt_folder, optimizer)
|
||||||
|
# load lr scheduler states.
|
||||||
|
load_scheduler(self.load_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
|
||||||
|
# load training states.
|
||||||
|
load_context(self.load_ckpt_folder, train_dl, train_state)
|
||||||
|
# load dataloader sampler states.
|
||||||
|
if hasattr(train_state, "batch_sampler") and not isinstance(
|
||||||
|
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
|
||||||
|
):
|
||||||
|
load_sampler(self.load_ckpt_folder, train_dl.batch_sampler)
|
||||||
|
if hasattr(train_state, "data_state_dict"):
|
||||||
|
train_dl.dataset.load_state_dict(
|
||||||
|
llm_load(os.path.join(self.load_ckpt_folder, "sampler_0.pt")), ckpt_path=self.load_ckpt_folder
|
||||||
|
)
|
||||||
|
self.optimizer = optimizer
|
||||||
|
self.lr_scheduler = lr_scheduler
|
||||||
|
|
||||||
|
def save_checkpoint(
|
||||||
|
self,
|
||||||
|
folder,
|
||||||
|
model,
|
||||||
|
optimizer,
|
||||||
|
scheduler,
|
||||||
|
train_state: TrainState,
|
||||||
|
model_config: Dict = None,
|
||||||
|
model_config_file: str = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Save checkpoint to the given folder path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
self.set_save_folder(folder, train_state.step_count)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
torch.distributed.barrier()
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count}...")
|
||||||
|
|
||||||
|
timer("save-model").start()
|
||||||
|
save_model_checkpoint(folder=folder, model=model)
|
||||||
|
timer("save-model").stop()
|
||||||
|
|
||||||
|
timer("save-optimizer").start()
|
||||||
|
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
|
||||||
|
timer("save-optimizer").stop()
|
||||||
|
|
||||||
|
if (
|
||||||
|
hasattr(train_state, "data_state_dict")
|
||||||
|
and gpc.get_local_rank(ParallelMode.TENSOR) == 0
|
||||||
|
and gpc.get_local_rank(ParallelMode.PIPELINE) == 0
|
||||||
|
):
|
||||||
|
llm_save(
|
||||||
|
os.path.join(folder, f"sampler_{gpc.get_local_rank(ParallelMode.DATA)}.pt"),
|
||||||
|
saved_obj=train_state.data_state_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
scheduler_states = scheduler.state_dict()
|
||||||
|
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
|
||||||
|
if hasattr(train_state, "batch_sampler") and not isinstance(
|
||||||
|
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
|
||||||
|
):
|
||||||
|
sampler_state = train_state.batch_sampler.state_dict()
|
||||||
|
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
|
||||||
|
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
|
||||||
|
|
||||||
|
if model_config is not None:
|
||||||
|
# Model configuration dictionary.
|
||||||
|
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
|
||||||
|
|
||||||
|
if model_config_file is not None:
|
||||||
|
# The complete training config file content, stored in binary format.
|
||||||
|
llm_save(os.path.join(folder, "config_file.pt"), saved_obj=model_config_file)
|
||||||
|
|
||||||
|
torch.distributed.barrier()
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
timer.log(["save-model", "save-optimizer"], logger=logger)
|
||||||
|
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
|
||||||
|
if self.storage_manager.async_mode is False:
|
||||||
|
llm_save(
|
||||||
|
os.path.join(folder, f"{train_state.step_count}.step"),
|
||||||
|
saved_obj=dict({"step": train_state.step_count}),
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_save_folder(self, folder, step):
|
||||||
|
self.storage_manager.latest_save_folder = folder
|
||||||
|
self.storage_manager.latest_save_step = step
|
||||||
|
|
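Pieced together from the methods above, the intended call order for the new CheckpointManager is roughly the following. Everything is passed in as a parameter and nothing is constructed here, so treat this as a guide to the API surface rather than a drop-in training loop.

```python
# Sketch only: the objects (ckpt_manager, train_state, ...) come from the usual
# InternLM training setup, which is not shown here.
def run_with_checkpointing(ckpt_manager, train_state, lr_scheduler, optimizer, lr, train_dl, train_steps):
    ckpt_manager.try_load_model()  # model weights only; must happen before zero init elsewhere
    ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)

    for _ in range(train_steps):
        # ... forward / backward / optimizer step happens here ...
        now_break = ckpt_manager.try_save_checkpoint(train_state)  # normal, snapshot or signal-driven
        if now_break:  # a positive value in the stop file requests save-and-quit
            break

    ckpt_manager.wait_async_upload_finish()  # required when the async upload mode is enabled
```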
|
@ -1,15 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from functools import partial
|
from functools import partial, reduce
|
||||||
from typing import Any, Dict, List, Tuple
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
import pyecharts
|
import pyecharts
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
from internlm.core.context import global_context as gpc
|
|
||||||
from internlm.solver.pipeline_utils import partition_uniform
|
|
||||||
|
|
||||||
mb = 1024 * 1024
|
mb = 1024 * 1024
|
||||||
|
|
||||||
|
@ -107,6 +105,8 @@ class SimpleMemState:
|
||||||
"""
|
"""
|
||||||
Update the total memory usage of the model and sub-models.
|
Update the total memory usage of the model and sub-models.
|
||||||
"""
|
"""
|
||||||
|
self._total_mem = self._layer_mem
|
||||||
|
|
||||||
for stat in self.sub_model_stats.values():
|
for stat in self.sub_model_stats.values():
|
||||||
# Update sub-model status first.
|
# Update sub-model status first.
|
||||||
stat.update_total_memory()
|
stat.update_total_memory()
|
||||||
|
@ -169,6 +169,39 @@ class SimpleMemState:
|
||||||
return {"name": self.layer_name, "children": children}
|
return {"name": self.layer_name, "children": children}
|
||||||
|
|
||||||
|
|
||||||
|
class ActivationMemState:
|
||||||
|
"""
|
||||||
|
Activation Memory State
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, num_chunks: int) -> None:
|
||||||
|
self._num_chunks = num_chunks
|
||||||
|
|
||||||
|
self.inited: List[bool] = [False for _ in range(num_chunks)]
|
||||||
|
self.states: List[SimpleMemState] = [SimpleMemState(f"activations_{idx}") for idx in range(num_chunks)]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def total_mem(self) -> int:
|
||||||
|
return sum(state.total_mem for state in self.states)
|
||||||
|
|
||||||
|
def dump(self, prefix: str = "") -> str:
|
||||||
|
return reduce(lambda x, y: x + y, [state.dump(prefix) for state in self.states])
|
||||||
|
|
||||||
|
def to_json(self, base: int = 1024 * 1024) -> List:
|
||||||
|
return [state.to_json(base) for state in self.states]
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_naive_wrapper(model: torch.nn.Module) -> Tuple[torch.nn.Module, int]:
|
||||||
|
num_chunks = len(model) if isinstance(model, torch.nn.ModuleList) else 1
|
||||||
|
|
||||||
|
if num_chunks > 1:
|
||||||
|
model = torch.nn.ModuleList([_model.model if isinstance(_model, NaiveAMPModel) else _model for _model in model])
|
||||||
|
else:
|
||||||
|
model = model.model if isinstance(model, NaiveAMPModel) else model
|
||||||
|
|
||||||
|
return model, num_chunks
|
||||||
|
|
||||||
|
|
||||||
class SimpleMemoryProfiler:
|
class SimpleMemoryProfiler:
|
||||||
"""
|
"""
|
||||||
A memory profiler for a llm model.
|
A memory profiler for a llm model.
|
||||||
|
@ -177,7 +210,7 @@ class SimpleMemoryProfiler:
|
||||||
model (torch.nn.Module): The model to profile.
|
model (torch.nn.Module): The model to profile.
|
||||||
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
|
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
|
||||||
log_file (str): The file to write the memory state information to.
|
log_file (str): The file to write the memory state information to.
|
||||||
activation_config (List[str], optional): The list of activation layers to track. Defaults to None.
|
total_steps: number of steps to trace.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -186,9 +219,8 @@ class SimpleMemoryProfiler:
|
||||||
optimizer: torch.optim.Optimizer,
|
optimizer: torch.optim.Optimizer,
|
||||||
log_folder: str,
|
log_folder: str,
|
||||||
total_steps: int = 5,
|
total_steps: int = 5,
|
||||||
activation_config: List[str] = None,
|
|
||||||
):
|
):
|
||||||
self._model = model
|
self._model, self._num_model_chunks = _unpack_naive_wrapper(model)
|
||||||
self._optimizer = optimizer
|
self._optimizer = optimizer
|
||||||
self._log_folder = log_folder
|
self._log_folder = log_folder
|
||||||
self._remaining_steps = total_steps
|
self._remaining_steps = total_steps
|
||||||
|
@ -197,17 +229,20 @@ class SimpleMemoryProfiler:
|
||||||
self._record_start_time = time.time()
|
self._record_start_time = time.time()
|
||||||
|
|
||||||
# For activation memory state.
|
# For activation memory state.
|
||||||
self._activation_config = activation_config
|
|
||||||
self._activation_mem_inited: bool = False
|
|
||||||
self._activation_mem: int = 0
|
self._activation_mem: int = 0
|
||||||
self._activation_max_count = 0
|
self._activation_mem_max: int = 0
|
||||||
self._activation_base_mem: SimpleMemState = SimpleMemState("activations")
|
self._activation_base_mems = ActivationMemState(self._num_model_chunks)
|
||||||
|
|
||||||
# Check or create log folder
|
# Check or create log folder
|
||||||
os.makedirs(self._log_folder, exist_ok=True)
|
os.makedirs(self._log_folder, exist_ok=True)
|
||||||
|
|
||||||
# Register activation memory tracking hooks
|
# Register activation memory tracking hooks
|
||||||
self._register_activation_trace_hooks()
|
if self._num_model_chunks > 1:
|
||||||
|
for chunk_id in range(self._num_model_chunks):
|
||||||
|
self._register_activation_trace_hooks(chunk_id, self._model[chunk_id])
|
||||||
|
else:
|
||||||
|
self._register_activation_trace_hooks(0, self._model)
|
||||||
|
|
||||||
# Calculate static parameter cuda memory
|
# Calculate static parameter cuda memory
|
||||||
self._param_mem_state = SimpleMemState("param_mem")
|
self._param_mem_state = SimpleMemState("param_mem")
|
||||||
|
@ -221,7 +256,7 @@ class SimpleMemoryProfiler:
|
||||||
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
|
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
|
||||||
|
|
||||||
# Generate the first memory record
|
# Generate the first memory record
|
||||||
self.point(create=True)
|
self.point(with_options="params,grads,os_params", create=True)
|
||||||
|
|
||||||
def point(self, with_options: str = "", create: bool = False) -> None:
|
def point(self, with_options: str = "", create: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -272,7 +307,7 @@ class SimpleMemoryProfiler:
|
||||||
if "os_state" in options:
|
if "os_state" in options:
|
||||||
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
|
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
|
||||||
if "activation_base" in options:
|
if "activation_base" in options:
|
||||||
layout_info += "activation_base_layout:\n" + self._activation_base_mem.dump()
|
layout_info += "activation_base_layout:\n" + self._activation_base_mems.dump()
|
||||||
|
|
||||||
# Write memory state information to log file
|
# Write memory state information to log file
|
||||||
file_mode = "w" if create else "a"
|
file_mode = "w" if create else "a"
|
||||||
|
@ -315,14 +350,14 @@ class SimpleMemoryProfiler:
|
||||||
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
|
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
|
||||||
"os_memory_sunburst",
|
"os_memory_sunburst",
|
||||||
)
|
)
|
||||||
self._render_sunburst_chart(self._activation_base_mem.to_json()["children"], "activation_memory_sunburst")
|
self._render_sunburst_chart(self._activation_base_mems.to_json(), "activation_memory_sunburst")
|
||||||
# Generate summary sunburst chart
|
# Generate summary sunburst chart
|
||||||
summary_sunburst_data = [
|
summary_sunburst_data = [
|
||||||
{"name": "params", "value": self._param_mem_state.total_mem // mb},
|
{"name": "params", "value": self._param_mem_state.total_mem // mb},
|
||||||
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
|
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
|
||||||
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
|
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
|
||||||
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
|
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
|
||||||
{"name": "activation", "value": self._activation_base_mem.total_mem // mb},
|
{"name": "activation", "value": self._activation_mem_max // mb},
|
||||||
]
|
]
|
||||||
|
|
||||||
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
|
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
|
||||||
|
@ -337,12 +372,13 @@ class SimpleMemoryProfiler:
|
||||||
{},
|
{},
|
||||||
{
|
{
|
||||||
"r0": "10%",
|
"r0": "10%",
|
||||||
"r": "40%",
|
"r": "35%",
|
||||||
"itemStyle": {"borderWidth": 3},
|
"itemStyle": {"borderWidth": 3},
|
||||||
"label": {"align": "left"},
|
"label": {"align": "left"},
|
||||||
},
|
},
|
||||||
{"r0": "40%", "r": "65%", "label": {"align": "left"}},
|
{"r0": "35%", "r": "55%", "label": {"align": "left"}},
|
||||||
{"r0": "65%", "r": "80%", "label": {"align": "left"}},
|
{"r0": "55%", "r": "70%", "label": {"align": "left"}},
|
||||||
|
{"r0": "70%", "r": "80%", "label": {"align": "left"}},
|
||||||
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
|
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
|
||||||
{
|
{
|
||||||
"r0": "90%",
|
"r0": "90%",
|
||||||
|
@ -357,7 +393,14 @@ class SimpleMemoryProfiler:
|
||||||
f"{self._log_folder}/{name}.html"
|
f"{self._log_folder}/{name}.html"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _inner_activation_trace_hook(self, layer_name: str, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
def _inner_activation_trace_hook(
|
||||||
|
self,
|
||||||
|
chunk_id: int,
|
||||||
|
layer_name: str,
|
||||||
|
model: Any,
|
||||||
|
inputs: Any,
|
||||||
|
output: torch.Tensor,
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for an inner layer.
|
Hook function to trace the activation memory usage for an inner layer.
|
||||||
|
|
||||||
|
@ -373,13 +416,15 @@ class SimpleMemoryProfiler:
|
||||||
del model, inputs
|
del model, inputs
|
||||||
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
|
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
|
||||||
|
|
||||||
if self._stoped or self._activation_mem_inited:
|
if self._stoped or self._activation_base_mems.inited[chunk_id]:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
|
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
|
||||||
self._activation_base_mem.add(layer_name, output.element_size() * output.nelement(), flush=False)
|
self._activation_base_mems.states[chunk_id].add(
|
||||||
|
layer_name, output.element_size() * output.nelement(), flush=False
|
||||||
|
)
|
||||||
|
|
||||||
def _activation_trace_hook_forward(self, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
def _activation_trace_hook_forward(self, chunk_id: int, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for a forward pass.
|
Hook function to trace the activation memory usage for a forward pass.
|
||||||
|
|
||||||
|
@ -398,23 +443,24 @@ class SimpleMemoryProfiler:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Check if the activation memory has been initialized
|
# Check if the activation memory has been initialized
|
||||||
if self._activation_mem_inited is False:
|
if self._activation_base_mems.inited[chunk_id] is False:
|
||||||
|
self._activation_base_mems.inited[chunk_id] = True
|
||||||
# Update the total memory of the activation base memory state
|
# Update the total memory of the activation base memory state
|
||||||
self._activation_base_mem.update_total_memory()
|
self._activation_base_mems.states[chunk_id].update_total_memory()
|
||||||
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
|
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
|
||||||
self._activation_mem_inited = True
|
with_options = "activation_base"
|
||||||
|
else:
|
||||||
|
with_options = ""
|
||||||
|
|
||||||
# Accumulate activation memory usage for each forward pass
|
# Accumulate activation memory usage for each forward pass
|
||||||
self._activation_mem += self._activation_base_mem.total_mem
|
self._activation_mem += self._activation_base_mems.states[chunk_id].total_mem
|
||||||
|
if self._activation_mem > self._activation_mem_max:
|
||||||
# Update activation max count
|
self._activation_mem_max = self._activation_mem
|
||||||
if self._activation_mem // self._activation_base_mem.total_mem > self._activation_max_count:
|
|
||||||
self._activation_max_count = self._activation_mem // self._activation_base_mem.total_mem
|
|
||||||
|
|
||||||
# Trigger a memory record
|
# Trigger a memory record
|
||||||
self.point()
|
self.point(with_options)
|
||||||
|
|
||||||
def _activation_tarce_hook_backward(self, model: Any, inputs: Any, grad_outputs: Any) -> None:
|
def _activation_tarce_hook_backward(self, chunk_id: int, model: Any, inputs: Any, grad_outputs: Any) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for a backward pass.
|
Hook function to trace the activation memory usage for a backward pass.
|
||||||
|
|
||||||
|
@ -432,37 +478,28 @@ class SimpleMemoryProfiler:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Release activation memory usage for each backward pass
|
# Release activation memory usage for each backward pass
|
||||||
self._activation_mem -= self._activation_base_mem.total_mem
|
self._activation_mem -= self._activation_base_mems.states[chunk_id].total_mem
|
||||||
|
|
||||||
# Trigger a memory record
|
# Trigger a memory record
|
||||||
self.point()
|
self.point()
|
||||||
|
|
||||||
def _register_activation_trace_hooks(self) -> None:
|
def _register_activation_trace_hooks(self, chunk_id: int, model_chunk: torch.nn.Module) -> None:
|
||||||
"""
|
"""
|
||||||
Register activation trace hooks for the model and each submodule in the model.
|
Register activation trace hooks for the model and each submodule in the model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Register inner activation trace hooks for each submodule in the model
|
# Register inner activation trace hooks for each submodule in the model
|
||||||
for layer_name in self._activation_config:
|
for layer_name, sub_model in model_chunk.named_modules():
|
||||||
# Register a hook for every activation
|
|
||||||
model = self._model
|
|
||||||
sub_models = layer_name.split(".")
|
|
||||||
# Get the target sub-model
|
|
||||||
for sub_model_name in sub_models:
|
|
||||||
try:
|
|
||||||
model = model.get_submodule(sub_model_name)
|
|
||||||
except AttributeError:
|
|
||||||
model = None
|
|
||||||
break
|
|
||||||
|
|
||||||
# Register the hook
|
# Register the hook
|
||||||
if model is not None:
|
if len(sub_model._modules) != 0:
|
||||||
model.register_forward_hook(partial(self._inner_activation_trace_hook, layer_name))
|
continue # TODO: in some special cases, we may need some additional configuration to correct
|
||||||
|
|
||||||
|
sub_model.register_forward_hook(partial(self._inner_activation_trace_hook, chunk_id, layer_name))
|
||||||
|
|
||||||
# Register a forward hook for the main model to track activation memory usage
|
# Register a forward hook for the main model to track activation memory usage
|
||||||
self._model.register_forward_hook(self._activation_trace_hook_forward)
|
model_chunk.register_forward_hook(partial(self._activation_trace_hook_forward, chunk_id))
|
||||||
# Register a backward hook for the main model to release activation memory usage
|
# Register a backward hook for the main model to release activation memory usage
|
||||||
self._model.register_full_backward_hook(self._activation_tarce_hook_backward)
|
model_chunk.register_full_backward_hook(partial(self._activation_tarce_hook_backward, chunk_id))
|
||||||
|
|
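The rewrite above drops the hand-written activation_config list and instead walks named_modules(), hooking only leaf modules (those with no children). A self-contained sketch of that registration pattern on a toy model:

```python
from functools import partial

import torch

# Accumulate per-layer activation sizes by hooking every leaf module.
activation_bytes = {}

def record_activation(layer_name, module, inputs, output):
    if isinstance(output, torch.Tensor):
        activation_bytes[layer_name] = output.element_size() * output.nelement()

model = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Linear(32, 8)),
)

for layer_name, sub_model in model.named_modules():
    if len(sub_model._modules) != 0:
        continue  # modules with children are skipped; only leaves get a hook
    sub_model.register_forward_hook(partial(record_activation, layer_name))

model(torch.randn(4, 16))
print(activation_bytes)  # {'0': 512, '1.0': 512, '1.1': 128} with float32 tensors
```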
||||||
def _calc_tensor_memory(
|
def _calc_tensor_memory(
|
||||||
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
|
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
|
||||||
|
@ -554,48 +591,6 @@ class SimpleMemoryProfiler:
|
||||||
self._calc_tensor_memory(root_stat, named_tensors)
|
self._calc_tensor_memory(root_stat, named_tensors)
|
||||||
|
|
||||||
|
|
||||||
def build_activation_config(num_layers: int, num_chunks: int = 1) -> List[str]:
|
|
||||||
# TODO: support interleaved pipeline scheduling.
|
|
||||||
assert num_chunks == 1, "Only support num_chunks == 1"
|
|
||||||
|
|
||||||
if gpc.is_initialized(ParallelMode.PIPELINE):
|
|
||||||
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
|
|
||||||
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
|
||||||
else:
|
|
||||||
pipeline_size = 1
|
|
||||||
pipeline_rank = 0
|
|
||||||
|
|
||||||
all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
|
|
||||||
parts = all_parts[pipeline_rank]
|
|
||||||
start, end = parts[0]
|
|
||||||
num_blocks = end - start
|
|
||||||
|
|
||||||
block_conf_tmpl = [
|
|
||||||
"mixer.rotary_emb",
|
|
||||||
"mixer.Wqkv",
|
|
||||||
"mixer.inner_attn",
|
|
||||||
"mixer.inner_cross_attn",
|
|
||||||
"mixer.out_proj",
|
|
||||||
# "dropout1", # skip when dropout_selective_checkpoint is True
|
|
||||||
# "dropout2", # skip when dropout_selective_checkpoint is True
|
|
||||||
"norm1",
|
|
||||||
"norm2",
|
|
||||||
"mlp.w1",
|
|
||||||
"mlp.w2",
|
|
||||||
"mlp.w3",
|
|
||||||
]
|
|
||||||
|
|
||||||
block_conf = []
|
|
||||||
for block_id in range(num_blocks):
|
|
||||||
block_conf += [f"blocks.{block_id}.{layer}" for layer in block_conf_tmpl]
|
|
||||||
|
|
||||||
# We don't need to care about whether the embedding, norm, and head layers exist in the model after partitioning.
|
|
||||||
# If they don't exist, they will be automatically ignored when registering activation trace hooks.
|
|
||||||
activation_conf = ["embedding", "norm", "head"] + block_conf
|
|
||||||
|
|
||||||
return activation_conf
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
class SimpleModel(torch.nn.Module):
|
class SimpleModel(torch.nn.Module):
|
||||||
|
@ -635,32 +630,39 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
def _simple_schedule(_num_chunks, _model_chunks, _input) -> torch.Tensor:
|
||||||
|
if _num_chunks > 1:
|
||||||
|
_output = _input
|
||||||
|
for _model_chunk in _model_chunks:
|
||||||
|
_output = _model_chunk(_output)
|
||||||
|
else:
|
||||||
|
_output = _model_chunks(_input)
|
||||||
|
|
||||||
|
return _output
|
||||||
|
|
||||||
|
# num_chunks config
|
||||||
|
_num_chunks = 1
|
||||||
|
|
||||||
# init model and optimizer
|
# init model and optimizer
|
||||||
_model: torch.nn.Module = SimpleModel()
|
if _num_chunks > 1:
|
||||||
|
_chunks = [SimpleModel(skip_layer2=idx % 2 == 0) for idx in range(_num_chunks)]
|
||||||
|
_model = torch.nn.ModuleList(_chunks).cuda()
|
||||||
|
else:
|
||||||
|
_model: torch.nn.Module = SimpleModel().cuda()
|
||||||
_optimizer = torch.optim.Adam(_model.parameters())
|
_optimizer = torch.optim.Adam(_model.parameters())
|
||||||
|
|
||||||
# create activation config for simple model layer by layer.
|
|
||||||
activation_configs = [
|
|
||||||
# model level 0
|
|
||||||
"layer1",
|
|
||||||
"layer2",
|
|
||||||
"layer3",
|
|
||||||
# model level 1
|
|
||||||
"layer2.layer1",
|
|
||||||
"layer2.layer3",
|
|
||||||
]
|
|
||||||
|
|
||||||
_model.modules()
|
|
||||||
|
|
||||||
# init profiler
|
# init profiler
|
||||||
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler.log", activation_configs)
|
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler", total_steps=1)
|
||||||
|
|
||||||
_optimizer.zero_grad()
|
_optimizer.zero_grad()
|
||||||
|
|
||||||
x1 = torch.randn((128, 5120))
|
# inputs
|
||||||
x2 = torch.randn((128, 5120))
|
x1 = torch.randn((128, 5120)).cuda()
|
||||||
out1 = _model(x1)
|
x2 = torch.randn((128, 5120)).cuda()
|
||||||
out2 = _model(x2)
|
# forward
|
||||||
|
out1 = _simple_schedule(_num_chunks, _model, x1)
|
||||||
|
out2 = _simple_schedule(_num_chunks, _model, x2)
|
||||||
|
# backward
|
||||||
out1.mean().backward()
|
out1.mean().backward()
|
||||||
out2.mean().backward()
|
out2.mean().backward()
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,6 @@ from asyncio.tasks import ALL_COMPLETED
 from datetime import datetime
 from typing import Any, Awaitable, Callable, Dict, List, Union

-import boto3
-import botocore
 import torch
 import torch.distributed as dist

@ -24,6 +22,13 @@ from internlm.core.context import global_context as gpc
 from internlm.utils.common import SingletonMeta
 from internlm.utils.logger import get_logger

+try:
+    import boto3
+    import botocore
+except ImportError:
+    pass
+

 logger = get_logger(__file__)

 boto3_url_re = re.compile(r"([^\.]+)\.([\d\.]+)")

@ -234,13 +239,13 @@ class Boto3Client(StorageClient):
         """
         paginator = handler.client.get_paginator("list_objects_v2")
         pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)

         folder_name_list = []
         for page in pages:
-            for obj in page["Contents"]:
-                fp: str = obj["Key"]
-                folder_name_list.append(fp.rsplit("/", maxsplit=1)[1])
-        return folder_name_list
+            if "Contents" in page:
+                for obj in page["Contents"]:
+                    pth: str = obj["Key"]
+                    folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
+        return list(set(folder_name_list))

     @staticmethod
     def async_upload_fileobj(handler, bucket_name: str, fp: str, local_nvme_path: str):
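The new pagination loop above skips pages without a "Contents" key, which `list_objects_v2` omits when the prefix matches nothing, and dedupes the first path component under the prefix. A standalone sketch of the same idea, with a hypothetical bucket and prefix:

import boto3

def list_subfolders(bucket: str, prefix: str):
    # Collect the first path component under `prefix`, skipping pages that
    # carry no "Contents" (e.g. when the prefix matches no objects).
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    names = set()
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        if "Contents" not in page:
            continue
        for obj in page["Contents"]:
            rest = obj["Key"][len(prefix):].strip("/")
            if rest:
                names.add(rest.split("/", maxsplit=1)[0])
    return sorted(names)

# Example (hypothetical bucket/prefix):
# print(list_subfolders("my-ckpt-bucket", "llm_ckpts/"))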
@ -391,6 +396,11 @@ class StorageManager(metaclass=SingletonMeta):
         self.tmp_local_folder = tmp_local_folder
         self.async_mode = async_mode
         self.has_warning = False
+        self._async_loop = None
+        self._thread_pool = None
+        self.latest_save_folder = None
+        self.latest_save_step = 0
+        self.async_task_peeding = False

         if enable_save and self.async_mode:
             self._async_loop = asyncio.new_event_loop()

@ -485,6 +495,7 @@ class StorageManager(metaclass=SingletonMeta):
                 torch.save(saved_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
             self.async_executor(meta.async_upload_fn, *unpack_meta(meta))
             os.chmod(tmp_step_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
+            self.async_task_peeding = True
         else:
             meta.client.sync_upload_fileobj(*unpack_meta(meta), *args, saved_obj=saved_obj, **kwargs)
             self.upload_count += 1
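The `async_task_peeding` flag added above is what later allows the flush step to return early when no upload was scheduled. A rough sketch of that flag-guarded pattern using a plain thread pool (not the repository's actual event-loop implementation):

from concurrent.futures import ThreadPoolExecutor, wait

class AsyncSaver:
    def __init__(self):
        self._pool = ThreadPoolExecutor(max_workers=4)
        self._futures = []
        self.task_pending = False  # plays the role of async_task_peeding

    def submit(self, fn, *args):
        # Queue the upload and remember that there is outstanding work.
        self._futures.append(self._pool.submit(fn, *args))
        self.task_pending = True

    def wait_finish(self):
        if not self.task_pending:
            return  # nothing was scheduled since the last flush
        wait(self._futures)
        self._futures.clear()
        self.task_pending = False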
@ -523,23 +534,22 @@ class StorageManager(metaclass=SingletonMeta):
         pass

     async def _sync_tasks(self) -> Awaitable[None]:
-        if not self._async_stack:
-            return
-
-        await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
-
-        for task in self._async_stack:
-            try:
-                task.exception()
-            except InvalidStateError:
-                continue
-            except Exception as e:
-                file_id = len(self._exception_list)
-                self._exception_list.append((e, file_id))
-
-                logger.error(f"File: {self._to_be_del_files[file_id]}, " f"upload failed with {e}")
-
-        self._async_stack.clear()
+        if self._async_stack:
+            await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
+            count = 0
+            while self._async_stack:
+                t = self._async_stack[0]
+                try:
+                    e = t.exception()
+                    if e:
+                        self._exception_list.append((e, count))
+                        logger.error(f"File:{self._to_be_del_files[count]}, upload failed for {e}")
+                        # raise e
+                    count += 1
+                    self._async_stack.pop(0)
+                except InvalidStateError:
+                    # Not finished. https://docs.python.org/3/library/asyncio-task.html#asyncio.Task.exception
+                    pass

     def async_executor(self, fn: Callable, *args, **kwargs) -> None:
         """

@ -559,11 +569,14 @@ class StorageManager(metaclass=SingletonMeta):
         if not self.async_mode:
             return

+        if not self.async_task_peeding:
+            return
+
         if self._async_loop:
             self._async_loop.run_until_complete(self._sync_tasks())

         if self._exception_list:
-            for file_id, error_msg in self._exception_list:
+            for error_msg, file_id in self._exception_list:
                 logger.error(
                     f"Node:{socket.gethostname()}, Error: Checkpoint {self._to_be_del_files[file_id]} "
                     f"failed on step {self.upload_count}: {error_msg}"

@ -577,10 +590,16 @@ class StorageManager(metaclass=SingletonMeta):
         self._del_tmp_folder()
         self._exception_list.clear()
         self._to_be_del_files.clear()
+        self.async_task_peeding = False

         if gpc.is_rank_for_log():
-            logger.info("all async uploads succeeded!")
             self.upload_count += 1
+            if self.async_mode:
+                self.save(
+                    os.path.join(self.latest_save_folder, f"{self.latest_save_step}.step"),
+                    saved_obj=dict({"step": self.latest_save_step}),
+                    async_upload=False,
+                )


 storage_manager: StorageManager = None
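The rewritten `_sync_tasks` drains the task stack one entry at a time and treats `InvalidStateError` as "not finished yet" rather than as a failure. A compact, self-contained sketch of that inspection pattern, assuming all tasks live on the same event loop:

import asyncio

async def drain(tasks, names):
    # Wait for everything, then inspect each task in submission order.
    if not tasks:
        return []
    await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
    failures = []
    for idx, task in enumerate(tasks):
        try:
            exc = task.exception()  # raises InvalidStateError if still running
            if exc:
                failures.append((exc, idx))
                print(f"File:{names[idx]}, upload failed for {exc}")
        except asyncio.InvalidStateError:
            pass  # not finished; leave it for the next flush
    return failures

async def _demo():
    async def ok():
        return 1

    async def bad():
        raise RuntimeError("boom")

    tasks = [asyncio.create_task(ok()), asyncio.create_task(bad())]
    return await drain(tasks, ["a.pt", "b.pt"])

print(asyncio.run(_demo()))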
@ -11,10 +11,6 @@ from torch.utils.tensorboard import SummaryWriter
 from internlm.core.context import global_context as gpc


-def copy_ignore_folder(source_path, target_path):
-    os.system(f"cp -r {source_path}/* {target_path}/")
-
-
 def tb_save_run_info(writer, config_lines, global_step=0):
     writer.add_text(tag="cmd", text_string=" ".join(sys.argv[:]), global_step=global_step)
     lines = []

@ -42,9 +38,21 @@ def init_tb_writer(
         tb_folder = tensorboard_folder

     if gpc.get_global_rank() == 0:
+        # If we don't load ckpt, 'resume_tb_folder' is set as the tensorboard
+        # dir of the last task by 'make_launch_script.sh'.
+        # If we load ckpt, 'resume_tb_folder' will be overwritten as the
+        # reloaded 'train_state.resume_tb_folder'.s
         if resume_tb_folder is not None:
-            logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}...")
-            copy_ignore_folder(resume_tb_folder, tb_folder)
+            assert len(resume_tb_folder) > 0 and resume_tb_folder != "/"
+            if not os.path.exists(resume_tb_folder):
+                logger.error(
+                    f"Can't found resume_tb_folder{resume_tb_folder}, \
+                    please make sure this folder is located at local file system."
+                )
+            else:
+                logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}... ")
+                os.system(f"cp -r {resume_tb_folder}/* {tb_folder}/")
+                os.system(f"chmod -R +w {tb_folder}/")
         else:
             logger.info(f"Login tensorboard logs to: {tb_folder}")

@ -126,6 +134,14 @@ class Writer:
         except Exception:
             traceback.print_exc()

+    def add_scalars(self, key, value, step):
+        try:
+            assert isinstance(value, dict)
+            if self.enable_tb and self.tb_writer is not None:
+                self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
+        except Exception:
+            traceback.print_exc()
+
     def add_text(self, key, value, step):
         try:
             if self.enable_tb and self.tb_writer is not None:
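The new `Writer.add_scalars` forwards a dict of named values to TensorBoard under one main tag and swallows logging failures so they cannot break training. A minimal standalone version of the same wrapper (the log directory name is arbitrary):

import traceback

from torch.utils.tensorboard import SummaryWriter

class SafeWriter:
    def __init__(self, log_dir="tb_logs", enable_tb=True):
        self.enable_tb = enable_tb
        self.tb_writer = SummaryWriter(log_dir=log_dir) if enable_tb else None

    def add_scalars(self, key, value, step):
        # Validate the payload, write if enabled, and never let a logging
        # failure propagate into the training loop.
        try:
            assert isinstance(value, dict)
            if self.enable_tb and self.tb_writer is not None:
                self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
        except Exception:
            traceback.print_exc()

# Usage:
# SafeWriter().add_scalars("loss", {"train": 1.23, "val": 1.30}, step=10)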
@ -13,4 +13,4 @@ boto3
 botocore
 torch-scatter
 pyecharts
--f https://data.pyg.org/whl/torch-1.13.0+cu117.html
+-f https://data.pyg.org/whl/torch-1.13.1+cu117.html
train.py
@ -5,99 +5,48 @@ import socket
 import time
 import traceback
 from functools import partial
-from typing import Iterable

-import numpy as np
 import torch
 import torch.distributed as dist
-from torch import nn
-from torch.utils.data import DataLoader

 import internlm
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.core.naive_amp import NaiveAMPModel
 from internlm.core.scheduler import SchedulerMetricHook
 from internlm.core.trainer import TrainState
-from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
-from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
-from internlm.data.dataset import get_dataset_dict
-from internlm.data.dummy_dataset import RandomDataset
-from internlm.data.packed_dataset import (
-    PackedDataset,
-    PackedDatasetWithoutCuSeqlen,
-    get_packed_dataset_without_short_length,
-)
-from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
+from internlm.initialize import initialize_distributed_env
 from internlm.model.loss import FlashGPTLMLoss
 from internlm.model.metrics import AccPerplex
-from internlm.monitor import initialize_monitor_manager, send_alert_message, set_env_var
+from internlm.monitor import initialize_monitor_manager, send_alert_message
 from internlm.monitor.monitor import monitor_manager as mm
-from internlm.solver.beta2_scheduler import Beta2Scheduler
-from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
-from internlm.solver.optimizer import HybridZeroOptimizer
+from internlm.train import (
+    get_train_data_loader,
+    get_validation_data_loader,
+    initialize_llm_profile,
+    initialize_model,
+    initialize_optimizer,
+    load_new_batch,
+    record_current_batch_training_metrics,
+)
 from internlm.utils.common import (
     BatchSkipper,
-    DummyProfile,
-    get_master_node,
     get_megatron_flops,
     launch_time,
     parse_args,
 )
 from internlm.utils.evaluation import evaluate_on_val_dls
+from internlm.utils.gputest import bench_gpu, bench_net
 from internlm.utils.logger import get_logger, initialize_uniscale_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
-from internlm.utils.model_checkpoint import (
-    CheckpointSaveManager,
-    load_context,
-    load_model_checkpoint,
-    load_optimizer_checkpoint,
-    load_sampler,
-    load_scheduler,
-)
-from internlm.utils.parallel import (
-    get_parallel_log_file_name,
-    is_no_pp_or_last_stage,
-    sync_model_param,
-    sync_model_param_within_tp,
-)
-from internlm.utils.registry import MODEL_INITIALIZER
-from internlm.utils.simple_memory_profiler import (
-    SimpleMemoryProfiler,
-    build_activation_config,
-)
+from internlm.utils.model_checkpoint import CheckpointManager
+from internlm.utils.parallel import get_parallel_log_file_name
+from internlm.utils.simple_memory_profiler import SimpleMemoryProfiler
 from internlm.utils.writer import Writer

 # global llm logger
 logger = get_logger(__file__)


-def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
-    """
-    Initialize distributed environment for distributed training.
-
-    Args:
-        config (str): Config file path.
-        launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
-        master_port (str): The master port for distributed training. 8888 by default.
-        seed (int, optional): Specified random seed for every process. 1024 by default.
-    """
-
-    torch.cuda.empty_cache()
-
-    if launcher == "torch":
-        internlm.launch_from_torch(config=config, seed=seed)
-    elif launcher == "slurm":
-        internlm.launch_from_slurm(
-            config=config,
-            host=get_master_node(),
-            port=master_port,
-            seed=seed,
-        )
-    else:
-        assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
-
-
 def initialize_llm_logger(start_time: str):
     """
     Initialize customed uniscale logger.

@ -118,357 +67,14 @@ def initialize_llm_logger(start_time: str):
     return uniscale_logger


-def initialize_model():
-    """
-    Initialize model.
-
-    Returns: The neural network model to be trained or evaluated.
-    """
-
-    model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
-    if isinstance(model, nn.ModuleList):
-        model = nn.ModuleList(
-            [
-                NaiveAMPModel(
-                    model=_m,
-                    output_to_fp32=False,  # manually controlled by interleaved pipleline scheduler
-                    dtype=gpc.config.model.get("dtype", torch.half),
-                    sync_buffer=False,
-                )
-                for _m in model
-            ]
-        )
-    else:
-        model = NaiveAMPModel(
-            model=model,
-            output_to_fp32=is_no_pp_or_last_stage(),
-            dtype=gpc.config.model.get("dtype", torch.half),
-            sync_buffer=False,
-        )
-
-    # This sync is very important, cause the model weights kept in optimizer are copied
-    # from the origin parameters in the memory, so we should make sure the dp sync
-    # does not influence the model weights in optimizer be different with the origin parameters.
-    sync_model_param(model, parallel_mode=ParallelMode.DATA)
-
-    # This function is needed to make sure parameters that are not splitted by tensor parallelism are
-    # the same across tensor parallelism.
-    sync_model_param_within_tp(model)
-
-    return model
-
-
-def get_train_data_loader(num_worker: int = 0):
-    """
-    Generate and return the training data loader.
-
-    Returns: A tuple of (train_dl, dataset_types).
-    """
-
-    # Get the dataset types
-    dataset_types = None
-    dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
-    data_cfg = gpc.config.data
-
-    # Get the sample weight dictionary
-    train_folder = data_cfg.train_folder
-
-    if not train_folder:
-        train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
-        if data_cfg.pack_sample_into_one:
-            train_ds = PackedDatasetWithoutCuSeqlen(
-                train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
-            )
-        else:
-            train_ds = PackedDataset(
-                train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
-            )
-    else:
-        train_ds = get_packed_dataset_without_short_length(
-            folder=data_cfg.train_folder,
-            packed_length=data_cfg.packed_length,
-            max_length_per_sample=data_cfg.seq_len,
-            show_progress=dist.get_rank() == 0,
-            min_length=data_cfg.min_length,
-            min_length_dict=data_cfg.get("min_length_dict", {}),
-            pack_into_one_sample=data_cfg.pack_sample_into_one,
-        )
-
-    # partition already completed
-    # assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen))
-    if isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen)):
-        datasets = [train_ds]
-    else:
-        datasets = train_ds.datasets
-
-    # Create the training dataset sampler
-    train_sampler = StaticBatchSampler(
-        datasets,
-        batch_size=data_cfg.micro_num,
-        rampup_batch_size=data_cfg.rampup_batch_size,
-        micro_bsz=data_cfg.micro_bsz,
-        seed=1024,
-        drop_last=True,
-        data_rank=gpc.get_local_rank(ParallelMode.DATA),
-        data_world_size=gpc.get_world_size(ParallelMode.DATA),
-    )
-
-    train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
-
-    # Create the training data loader
-    train_dl = DataLoader(
-        dataset=train_ds,
-        batch_sampler=train_sampler,
-        num_workers=num_worker,
-        pin_memory=True,
-        collate_fn=train_collate_fn,
-        persistent_workers=True,
-    )
-
-    return train_dl, dataset_types
-
-
-def get_validation_data_loader(num_worker: int = 0):
-    """Generate and return the validation data loader."""
-
-    data_cfg = gpc.config.data
-
-    if not data_cfg.valid_folder:
-        val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
-    else:
-        val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
-
-    if not isinstance(val_ds, dict):
-        val_ds = {"val": val_ds}
-
-    val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
-
-    val_dls = {}
-    for val_name, ds in val_ds.items():
-        # making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
-        # otherwise too much data may be dropped
-        batch_size = min(
-            data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
-        )
-        batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
-
-        if batch_size == 0 and gpc.is_rank_for_log():
-            logger.info(f"skip validate {val_name}.")
-            continue
-
-        val_dls[val_name] = get_dpsampler_dataloader(
-            ds, shuffle=False, num_workers=num_worker, batch_size=batch_size, collate_fn=val_collate_fn, drop_last=True
-        )  # drop_last=True, otherwise it may cause problems in the last batch
-
-        if gpc.is_rank_for_log():
-            logger.info(
-                f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
-                f"samples {str(len(val_dls[val_name]))}."
-            )
-
-    return val_dls
-
-
-def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
-    """
-    Load and return the new batch data based on training data loader.
-
-    Args:
-        train_dl (torch.utils.data.DataLoader): Dataloader for training.
-        train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
-        train_state (TrainState): Current training state.
-
-    Returns: A batch data and the updated train_iter.
-    """
-
-    timer("batch-gen").start()
-    try:
-        batch = next(train_iter)  # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
-        next(train_state.batch_sampler_iter)
-    except StopIteration:
-        train_iter = iter(train_dl)
-        batch = next(train_iter)
-        train_state.batch_sampler_iter = iter(train_state.batch_sampler)
-        next(train_state.batch_sampler_iter)
-        train_state.num_consumed_samples_in_epoch = 0
-    timer("batch-gen").stop()
-
-    if batch[0].get("type_ids", None) is not None:
-        # if use_flash_attn is False, we need to unpack type_ids
-        if not gpc.config.model.use_flash_attn:
-            batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
-
-    return batch, train_iter
-
-
-def initialize_optimizer(model: nn.Module):
-    """
-    Initialize optimizer.
-
-    Args:
-        model (torch.nn.Module): Your model instance to be trained or evaluated.
-
-    Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
-    """
-    adam_cfg = gpc.config.adam
-    naive_optimizer = torch.optim.AdamW(
-        params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
-        lr=adam_cfg.lr,
-        betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
-        eps=adam_cfg.adam_eps,
-    )
-
-    optimizer = HybridZeroOptimizer(
-        naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer
-    )
-
-    beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
-
-    lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
-
-    return optimizer, beta2_scheduler, lr_scheduler
-
-
-def initialize_llm_profile(profiling: bool = False, start_time: str = None):
-    """Initialize and return the profiler context manager instance."""
-
-    if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-        llm_profile = torch.profiler.profile
-        logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
-    else:
-        llm_profile = DummyProfile
-
-    return llm_profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
-            + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
-            + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
-            + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
-        ),
-        with_stack=True,
-        with_modules=True,
-    )
-
-
-def record_current_batch_training_metrics(
-    get_tflops_func,
-    logger,
-    writer,
-    success_update,
-    batch_count,
-    batch,
-    train_state,
-    optimizer,
-    beta2_scheduler,
-    trainer,
-    start_time,
-    loss,
-    grad_norm,
-    metric,
-    update_panel,
-):
-    """
-    Print some training metrics of current batch.
-    """
-
-    set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
-
-    if success_update in (0, True):
-        train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
-    if is_no_pp_or_last_stage():
-        acc_perplex = metric.get_metric()
-
-    if success_update and gpc.is_rank_for_log():
-        lr = optimizer.param_groups[0]["lr"]
-        if hasattr(trainer.engine.optimizer, "grad_scaler"):
-            scaler = trainer.engine.optimizer.grad_scaler._scale.item()
-        elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
-            scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
-
-        num_tokens_in_batch = batch[1].nelement()
-        num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-        max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
-        max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-        min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-
-        tk_per_gpu = 0
-        tk_per_gpu = round(
-            num_tokens_in_batch
-            * gpc.get_world_size(ParallelMode.DATA)
-            / gpc.get_world_size(ParallelMode.GLOBAL)
-            / (time.time() - start_time),
-            2,
-        )
-
-        tflops = get_tflops_func((time.time() - start_time))
-
-        infos = {
-            "tflops": tflops,
-            "step": batch_count,
-            "loss": loss.item(),
-            "tgs (tokens/gpu/second)": tk_per_gpu,
-            "lr": lr,
-            "loss_scale": scaler,
-            "grad_norm": grad_norm,
-        }
-
-        infos["micro_num"] = len(batch[1])
-        infos["num_consumed_tokens"] = train_state.num_consumed_tokens
-        infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
-        infos["num_samples_in_batch"] = num_samples_in_batch  # the number of batches which have the most samples
-        infos["largest_length"] = max_length_in_batch  # the longest input
-        infos["largest_batch"] = max_samples_in_batch  # the batch with the most samples
-        infos["smallest_batch"] = min_samples_in_batch
-        infos["adam_beta2"] = beta2_scheduler.get_beta2()
-
-        fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
-        infos["fwd_bwd_time"] = fwd_bwd_time
-
-        for key, value in acc_perplex.items():
-            infos[key] = value
-
-        line = ""
-        for key, value in infos.items():
-            line += f"{key}={value} "
-            writer.add_scalar(key=key, value=value, step=train_state.step_count)
-
-        if update_panel:
-            logger.info(
-                line,
-                extra={
-                    "step": batch_count,
-                    "lr": lr,
-                    "num_consumed_tokens": train_state.num_consumed_tokens,
-                    "grad_norm": grad_norm,
-                    "loss": loss.item(),
-                    "flops": tflops,
-                    "tgs": tk_per_gpu,
-                    "acc": acc_perplex["acc"],
-                    "perplexity": acc_perplex["perplexity"],
-                    "fwd_bwd_time": fwd_bwd_time,
-                },
-            )
-        else:
-            logger.info(line)
-
-        # if loss spike occurs, send alert info to feishu
-        mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
-
-
 def main(args):
     # init setting
     skip_batches = gpc.config.data.skip_batches
     total_steps = gpc.config.data.total_steps
     valid_every = gpc.config.data.valid_every
-    load_optimizer = gpc.config.ckpt.load_optimizer
     label_smoothing = gpc.config.loss.label_smoothing
     lr = gpc.config.adam.lr

-    load_model_only_folder = gpc.config.ckpt.get("load_model_only_folder", None)
-    load_resume_ckpt_folder = gpc.config.ckpt.get("load_ckpt_folder", None)
-
     get_tflops_func = partial(
         get_megatron_flops,
         checkpoint=gpc.config.model.checkpoint,

@ -490,46 +96,22 @@ def main(args):
     # initialize customed llm logger
     uniscale_logger = initialize_llm_logger(start_time=current_time)

-    # initialize customed llm writer
-    with open(args.config, "r") as f:
-        config_lines = f.readlines()
-    writer = Writer(
-        job_name=gpc.config.JOB_NAME,
-        launch_time=current_time,
-        file_name=get_parallel_log_file_name(),
-        tensorboard_folder=gpc.config.tensorboard_folder,
-        resume_tb_folder=gpc.config.resume_tb_folder,
-        config=config_lines,
-        logger=logger,
-        enable_tb=gpc.config.enable_tb,
-    )
-
-    model_load_path = None
-    if load_resume_ckpt_folder is not None:
-        logger.info(
-            f"===========Resume training from `{load_resume_ckpt_folder}` {current_time} on host:"
-            f"{socket.gethostname()}==========="
-        )
-        model_load_path = load_resume_ckpt_folder
-    elif load_model_only_folder is not None:
-        logger.info(
-            f"===========SFT training from `{load_model_only_folder}` {current_time} on host:"
-            f"{socket.gethostname()}==========="
-        )
-        model_load_path = load_model_only_folder
-    else:
-        logger.info(
-            f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
-            f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
-            f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
-        )
-
     # initialize and resume train state
     train_state = TrainState(gpc.config)

     # initialize model
     model = initialize_model()

+    with open(args.config, "r") as f:
+        config_lines = f.readlines()
+    ckpt_manager = CheckpointManager(
+        ckpt_config=gpc.config.ckpt,
+        model=model,
+        model_config=gpc.config.model,
+        model_config_file="".join(config_lines),
+        feishu_address=gpc.config.alert_address,
+    )
+
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)

@ -539,29 +121,24 @@ def main(args):
     train_state.init_batch_sampler(train_dl)

     # Loading model weights must be done before zero is initialized.
-    if model_load_path is not None:
-        load_model_checkpoint(folder=model_load_path, model=model)
+    ckpt_manager.try_load_model(current_time)

     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)

     # Loading other persistent training states.
-    if load_resume_ckpt_folder is not None:
-        # load lr scheduler states.
-        load_scheduler(load_resume_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
-        # load training states.
-        load_context(load_resume_ckpt_folder, train_dl, train_state)
-        # load dataloader sampler states.
-        load_sampler(load_resume_ckpt_folder, train_dl.batch_sampler)
-        # load optimzier states.
-        if load_optimizer:
-            load_optimizer_checkpoint(load_resume_ckpt_folder, optimizer)
-
-    ckpt_save_manager = CheckpointSaveManager(
-        ckpt_config=gpc.config.ckpt,
-        model=model,
-        optimizer=optimizer,
-        lr_scheduler=lr_scheduler,
-        model_config=gpc.config.model,
-    )
+    ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)
+
+    # initialize customed llm writer
+    writer = Writer(
+        job_name=gpc.config.JOB_NAME,
+        launch_time=current_time,
+        file_name=get_parallel_log_file_name(),
+        tensorboard_folder=gpc.config.tensorboard_folder,
+        resume_tb_folder=train_state.resume_tb_folder,  # resume from ckpt.
+        step_count=train_state.step_count,  # resume from ckpt.
+        config=config_lines,
+        logger=logger,
+        enable_tb=gpc.config.enable_tb,
+    )

     # initialize metric for calculating accuracy and perplexity

@ -598,12 +175,11 @@ def main(args):
     # initialize simple memory profiler
     if args.profiling:
         memory_profiler = SimpleMemoryProfiler(
-            model.model,
+            model,
             optimizer.optim,
             log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
             + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
             + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
-            activation_config=build_activation_config(gpc.config.model.num_layers),
         )
     else:
         memory_profiler = None

@ -621,6 +197,8 @@ def main(args):
         for batch_count in range(train_state.batch_count, total_steps):
             if batch_count % 50 == 0:
                 torch.cuda.empty_cache()
+                bench_gpu()
+                bench_net()

             start_time = time.time()
             timer("one-batch").start()

@ -645,6 +223,7 @@ def main(args):

             # do forward and backward
             timer("fwd-bwd").start()
+
             _, _, loss = trainer.execute_schedule(
                 batch, forward_only=False, return_loss=True, return_output_label=False
             )

@ -659,7 +238,7 @@ def main(args):
                 train_state.step_count += 1
             else:
                 train_state.inf_nan_skip_batches += 1  # record the amount of updating parameters unsuccessfully.
-                if -99.0 in grad_norm_groups and gpc.is_rank_for_log():  # -99.0 encodes a specific failure case
+                if -1 in grad_norm_groups.values() and gpc.is_rank_for_log():  # -1 encodes a specific failure case
                     logger.warning(f"Warning: skip parameter update at step {batch_count}.")
                     send_alert_message(
                         address=gpc.config.alert_address,

@ -680,7 +259,7 @@ def main(args):
                 trainer=trainer,
                 start_time=start_time,
                 loss=loss,
-                grad_norm=np.array(grad_norm_groups),
+                grad_norm=grad_norm_groups,
                 metric=metric,
                 update_panel=uniscale_logger is not None,
             )

@ -700,14 +279,17 @@ def main(args):

             # checkpoint the training states in specific steps, which is determined by the args "checkpoint_every"
             # # save batch sampler that tracks the true consumed samples
-            ckpt_save_manager.try_save_checkpoint(train_state)
+            now_break = ckpt_manager.try_save_checkpoint(train_state)
+            if now_break:
+                break

             if memory_profiler is not None:
                 memory_profiler.step()

-            prof.step()
+            if batch_count % 2 == 0:
+                prof.step()

-    ckpt_save_manager.wait_async_upload_finish()
+    ckpt_manager.wait_async_upload_finish()


 if __name__ == "__main__":
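Taken together, the hunks above let the checkpoint manager stop the loop (`try_save_checkpoint` now returns a break signal), advance the profiler only every other batch, and flush pending async uploads after the loop. A schematic of that control flow with stand-in classes; nothing here is the real InternLM API:

class FakeCheckpointManager:
    # Stand-in for the checkpoint manager: returns True when training should stop
    # (for example, a quota or stop signal was observed while saving).
    def __init__(self, stop_at):
        self.stop_at = stop_at

    def try_save_checkpoint(self, step):
        print(f"maybe save at step {step}")
        return step >= self.stop_at

    def wait_async_upload_finish(self):
        print("all pending uploads flushed")

class FakeProfiler:
    def step(self):
        print("profiler step")

ckpt_manager = FakeCheckpointManager(stop_at=3)
prof = FakeProfiler()

for batch_count in range(10):
    # ... forward/backward/optimizer update would run here ...
    if ckpt_manager.try_save_checkpoint(batch_count):
        break  # the manager asked the loop to stop after saving
    if batch_count % 2 == 0:
        prof.step()  # advance the profiler schedule every other batch

ckpt_manager.wait_async_upload_finish()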