Merge branch 'develop' of github.com:InternLM/InternLM into feature_add_moe

Conflicts: internlm/core/context/parallel_context.py internlm/core/context/process_group_initializer.py internlm/model/modeling_internlm.py internlm/solver/optimizer/hybrid_zero_optim.py internlm/train/training_internlm.py internlm/utils/model_checkpoint.py train.py
2023-09-12 18:04:48 +08:00 · 2023-09-12 18:04:48 +08:00 · d218a62b79
parent b10e5132fe 85e39aae67
commit d218a62b79
88 changed files with 6170 additions and 506 deletions
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@ -0,0 +1,28 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: doc/code-docs/source/conf.py
+  fail_on_warning: false
+
+# Optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+   install:
+   - requirements: doc/code-docs/requirements.txt
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@ -1,4 +1,5 @@
 JOB_NAME = "7b_train"
+DO_ALERT = False

 SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
@ -22,13 +23,16 @@ CHECKPOINT_EVERY = 50
 ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
-    # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
-    load_optimizer=True,  # Wheter to load optimizer states when continuing training.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. 'path' indicates the ckpt path,
+    # 2. 'content' specifies which states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. 'ckpt_type' specifies the type of checkpoint to be loaded; now only 'normal' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
-    snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
 )

@ -52,6 +56,8 @@ data = dict(
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
 )

 grad_scaler = dict(
@ -149,3 +155,12 @@ parallel = dict(

 cudnn_deterministic = False
 cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
--- a/doc/code-docs/Makefile
+++ b/doc/code-docs/Makefile
@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po
@ -0,0 +1,123 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/checkpoint.rst:2 09c8645fba264cdf9a80c4b62c2bb4d1
+msgid "模型保存"
+msgstr "Model Checkpointing"
+
+#: ../../source/checkpoint.rst:4 8b158d34631045b1afdb4fb0169b3c71
+msgid ""
+"InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` "
+"来管理模型保存。 其中，可以 使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
+"来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份，并在接收信号退出训练时自动进行模型备份。"
+msgstr ""
+"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation, "
+"we use ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports "
+"automatically loading the latest checkpoint at startup and automatically saving a checkpoint when an exit signal is received."
+
+#: ../../source/checkpoint.rst:8 a023b5a6d15749bfaa51cf2da194bda1
+msgid "Checkpointing"
+msgstr ""
+
+#: 938575c699d1426c87e0b3f589a85d50
+#: internlm.utils.model_checkpoint.CheckpointManager:1 of
+msgid "StorageManagerContext"
+msgstr ""
+
+#: 754d6881cd034c5ebaab0f3362dd14c2
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of
+msgid ""
+"Exit signal detection function, if we write the exit step in the "
+"'QUIT_FILE_PATH' file, all ranks will save ckpt and exit. Negative "
+"integer step means save ckpt. Positive integer step means save ckpt and "
+"quit."
+msgstr ""
+
+#: 2169f9fb4a8b40bc9bf6093894fc7a5e 6a55d2b2b24a44c8b78b40f19f4d950b
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training of
+msgid "参数"
+msgstr ""
+
+#: 360a89b1591e4627ac432f4d75050354
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
+msgid "返回"
+msgstr ""
+
+#: 2426832f4a8a4c5481be1c940e0e7b50
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of
+msgid "whether to quit."
+msgstr ""
+
+#: 5f6842c261544a3c89f32d981b3ad755
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
+msgid "返回类型"
+msgstr ""
+
+#: 1392da84b6e645bcb8dab605e1231fdc
+#: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1
+#: of
+msgid "wait for all checkpoint uploads to be completed"
+msgstr ""
+
+#: d1774593e9c94608b49b10504bfbc38b
+#: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1
+#: of
+msgid ""
+"Returns: Tuple(str, int): path of latest ckpt and ckpt step, if not "
+"found, None will return."
+msgstr ""
+
+#: a3abbbd2bd574872892d908ab248e804
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:1 of
+msgid "Attempt to restore the training state of the last ckpt."
+msgstr ""
+
+#: de021d1eb6d54955a2850c11c0191710
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:3 of
+msgid "lr_scheduler object."
+msgstr ""
+
+#: 20be15854f2e420a9d96c86b5869bfa6
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:5 of
+msgid "optimizer object."
+msgstr ""
+
+#: 68f69086c5054acc8aca15c8a764acc5
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:7 of
+msgid "learning rate."
+msgstr ""
+
+#: 5d34d34a972d4abeab4bda3e49ee157b
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:9 of
+msgid "traing states."
+msgstr ""
+
+#: 82ebb67afaa748ecabc4cef598d7fc30
+#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:11 of
+msgid "traning dataloader object"
+msgstr ""
+
+#: 0c95dfcd712749279daca78166bb4326
+#: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of
+msgid "Save checkpoint to the given folder path."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po
@ -0,0 +1,50 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/30B_demo.rst:2 242d1f89ae2045f1bf1f31bf82f07846
+msgid "30B Demo"
+msgstr ""
+
+#: ../../source/example/30B_demo.rst:5 c2415bfa6978414a939dcc395fdfb544
+msgid "训练配置"
+msgstr "Training Config"
+
+#: ../../source/example/30B_demo.rst:7 75f568d1ca5546228f88958c12c2dd65
+msgid "30B demo 训练配置文件样例如下:"
+msgstr "30B demo config file example:"
+
+#: ../../source/example/30B_demo.rst:164 533cb04f94314eeb8381e45f06d03108
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../source/example/30B_demo.rst:166 24974384d5ab42e68266aeb67ae222ce
+msgid "完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动两节点 16GPU 的训练命令如下所示："
+msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
+"The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs."
+
+#: ../../source/example/30B_demo.rst:173 948ac71ed53848f9bad07f69d956c4bb
+msgid "训练结果"
+msgstr "Training Results"
+
+#: ../../source/example/30B_demo.rst:175 615a3481b0aa49729b7219b1365519aa
+msgid "基于以上训练配置和启动命令，两节点 16GPU 下的模型训练部分日志展示如下："
+msgstr "Taking the configuration of the demo training on two nodes with 16 GPUs on slurm as an example, the training result log is shown below:"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po
@ -0,0 +1,50 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/7B_demo.rst:2 8576f969040249bb93e7c347ef210990
+msgid "7B Demo"
+msgstr ""
+
+#: ../../source/example/7B_demo.rst:5 5429ceea12424825991744bece744f60
+msgid "训练配置"
+msgstr "Training Config"
+
+#: ../../source/example/7B_demo.rst:7 c9a47faf5deb40b68ad2bc950fdf2b14
+msgid "7B demo 的训练配置文件样例如下:"
+msgstr "7B demo config file example:"
+
+#: ../../source/example/7B_demo.rst:162 eb93a6ca05c8421eb87a2470f9f31fc2
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../source/example/7B_demo.rst:164 9e7a864ae2e14d05b0681f16792e5278
+msgid "完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动单节点 8GPU 的训练命令如下所示："
+msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
+"The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs."
+
+#: ../../source/example/7B_demo.rst:171 fdd053efb1854d46aabf6c0f279fe7fc
+msgid "训练结果"
+msgstr "Training Results"
+
+#: ../../source/example/7B_demo.rst:173 33ec81f34e3c4340beacdb5254069d08
+msgid "基于以上训练配置和启动命令，单节点 8GPU 下的模型训练部分日志展示如下："
+msgstr "Taking the configuration of the demo training on a single machine with 8 GPUs on slurm as an example, the training result log is shown below:"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/index.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/index.po
@ -0,0 +1,33 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/index.rst:2 de54695e8bde40ffb8878043072197e6
+msgid "训练样例"
+msgstr "Training Example"
+
+#: ../../source/example/index.rst:5 da388b3209ff4bd39fd0700a7fba413a
+msgid "7B Demo"
+msgstr ""
+
+#: ../../source/example/index.rst:13 b095e27dfc924a7a943b7cba5361700a
+msgid "30B Demo"
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/index.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/index.po
@ -0,0 +1,81 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/index.rst:8 11e029810acf410180311a3c63eb01f4
+msgid "InternLM"
+msgstr "InternLM"
+
+#: ../../source/index.rst:11 e6fd7d058e4b43bb81157ac79867e3d3
+msgid "环境构建"
+msgstr "Environment Setup"
+
+#: ../../source/index.rst:19 f323ede90c0f434d8b627eded1d8fc10
+msgid "快速上手"
+msgstr "Quickstart Guide"
+
+#: ../../source/index.rst:27 3c504b4b92264e9182abb0fa81fe80c3
+msgid "训练构建"
+msgstr "Training Setup"
+
+#: ../../source/index.rst:35 5cc5c831399a40b089d27b777a776b16
+msgid "训练 API"
+msgstr "Training API"
+
+#: ../../source/index.rst:43 21a7473eabb441f8bfe28d2a0e306889
+msgid "并行训练"
+msgstr "Parallel Training"
+
+#: ../../source/index.rst:51 9234725f3c464731993d73607608c874
+msgid "模型备份"
+msgstr "Model Checkpointing"
+
+#: ../../source/index.rst:59 8e4ce037017f4510b2892a66003877fa
+msgid "性能分析"
+msgstr "Profiler"
+
+#: ../../source/index.rst:67 a36e02819ecd4b448a8cb4ebbecb6600
+msgid "训练监控"
+msgstr "Monitor"
+
+#: ../../source/index.rst:75 b912e292486f455c8b5cdd75962e8ac2
+msgid "训练样例"
+msgstr "Example"
+
+#: ../../source/index.rst:83 ea9e9281720941a1830e5df7a2badf7a
+msgid "常见问题"
+msgstr "Q&A"
+
+#: ../../source/index.rst:91 e08edc5aa1c74965b10084b393b88fae
+msgid "索引和表格"
+msgstr "Indices and tables"
+
+#: ../../source/index.rst:93 f3fdca059caa49dcad09aa44be7f02d6
+msgid ":ref:`genindex`"
+msgstr ""
+
+#: ../../source/index.rst:94 b3791e811315435097bb507edc3f4b9b
+msgid ":ref:`modindex`"
+msgstr ""
+
+#: ../../source/index.rst:95 a164b772960f4ab8b18c7e8820f69f55
+msgid ":ref:`search`"
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po
@ -0,0 +1,204 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-08 15:32+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/initialize.rst:2
+msgid "训练构建"
+msgstr "Training Setup"
+
+#: ../../source/initialize.rst:7
+msgid "命令行参数解析"
+msgstr "Argument Parsing"
+
+#: ../../source/initialize.rst:9
+#, fuzzy
+msgid ""
+"InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_"
+" 库来向InternLM运行时提供命令行参数配置。用户可使用 "
+"``internlm.initialize.get_default_parser()`` 来获取 InternLM "
+"的默认解析器，其中包含一些内置参数，用户可以向此解析器添加自定义参数。"
+msgstr ""
+"InternLM uses the `argparse "
+"<https://docs.python.org/3/library/argparse.html>`_ library to supply "
+"commandline configuration to the InternLM runtime. Use "
+"``internlm.initialize.get_default_parser()`` to get InternLM's default "
+"parser with some builtin arguments, users can add custom parameters to "
+"this parser."
+
+#: internlm.initialize.launch.get_default_parser:1 of
+msgid ""
+"Reads user command line and uses an argument parser to parse the input "
+"arguments. Input arguments include configuration, host, port, world size,"
+" local rank, backend for torch.distributed."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.initialize.launch.get_default_parser
+#: internlm.train.training_internlm.get_train_data_loader
+#: internlm.train.training_internlm.initialize_model
+#: internlm.train.training_internlm.initialize_optimizer of
+msgid "返回"
+msgstr ""
+
+#: internlm.initialize.launch.get_default_parser:4 of
+msgid ""
+"Returns the parser with the default arguments, the user may add "
+"customized arguments into this parser."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.initialize.launch.get_default_parser
+#: internlm.train.training_internlm.initialize_model of
+msgid "返回类型"
+msgstr ""
+
+#: ../../source/initialize.rst:25
+msgid "模型初始化"
+msgstr "Model Initialization"
+
+#: internlm.train.training_internlm.initialize_model:1 of
+msgid "Initialize model with Automatic Mixed Precision."
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_model:3 of
+msgid "The neural network model to be trained or evaluated."
+msgstr ""
+
+#: ../../source/initialize.rst:29
+msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下："
+msgstr ""
+"InternLM uses the fields ``model_type`` and ``model`` in the config file "
+"to control the model initialization process. An example model initialization "
+"configuration is defined as follows:"
+
+#: ../../source/initialize.rst:57
+msgid "字段 ``model_type`` 指明了要初始化的模型类型"
+msgstr ""
+"The field ``model_type`` specifies the model type that has been registered and"
+" is to be initialized."
+
+#: ../../source/initialize.rst:58
+msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置"
+msgstr ""
+"The parameters in the field ``model`` specify the configuration settings "
+"used during model initialization."
+
+#: ../../source/initialize.rst:60
+msgid ""
+"值得注意的是，用户可以定义新的模型类型，并使用装饰器 ``@MODEL_INITIALIZER.register_module`` "
+"注册模型的初始化函数，其中 ``MODEL_INITIALIZER`` 是类 "
+"``internlm.util.registry.Registry`` 的一个实例化对象，示例如下所示："
+msgstr ""
+"It is worth noting that users can define a new model type and register "
+"the model's initialization function with the decorator "
+"``@MODEL_INITIALIZER.register_module``, where ``MODEL_INITIALIZER`` is an"
+" instantiated object of class ``internlm.util.registry.Registry``. An "
+"example is shown as follows."
+
+#: ../../source/initialize.rst:72
+msgid "优化器初始化"
+msgstr "Optimizer Initialization"
+
+#: internlm.train.training_internlm.initialize_optimizer:1 of
+msgid "Initialize optimizer."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.train.training_internlm.get_train_data_loader
+#: internlm.train.training_internlm.initialize_optimizer of
+msgid "参数"
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_optimizer:3 of
+msgid "Your model instance to be trained or evaluated."
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_optimizer:6 of
+msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)."
+msgstr ""
+
+#: ../../source/initialize.rst:79
+msgid "数据加载器初始化"
+msgstr "Dataloader Initialization"
+
+#: internlm.train.training_internlm.get_train_data_loader:1 of
+msgid "Generate and return the training data loader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:3 of
+msgid "number of subprocesses used for dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:5 of
+msgid "generate function for dataset."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:7 of
+msgid "dataset sampler for training dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:9 of
+msgid "collate function for training dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:12 of
+msgid "A tuple of (train_dl, dataset_types)."
+msgstr ""
+
+#: ../../source/initialize.rst:86
+msgid "Trainer 初始化"
+msgstr "Trainer Initialization"
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:1 of
+msgid ""
+"Core function to wrap the essential training components with our "
+"functionality based on the config which is loaded into gpc.config."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:4 of
+msgid "Your model instance or a function to build the model."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:6 of
+msgid "Your optimizer for training."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:8 of
+msgid "Your criterion instance."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:10 of
+msgid "Dataloader for training."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:12 of
+msgid "Dataloader for testing."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:14 of
+msgid "Your lr scheduler instance, optional."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:17 of
+msgid ""
+"A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``"
+" where only ``trainer`` could not be None."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/install.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/install.po
@ -0,0 +1,140 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../../install.md:2 ../../../install.md:28
+#: c237a7328df9440eb54f36c5e6ceef46 e55787faf3f74d5996f251b28422cf15
+msgid "环境安装"
+msgstr "Installation"
+
+#: ../../../install.md:4 d5cd61481eb04f55a9b1636e47e2bc49
+msgid "环境准备"
+msgstr "Environment Preparation"
+
+#: ../../../install.md:5 418763cd4acb4ff3afba059ae7066739
+msgid "首先，需要安装的依赖包及对应版本列表如下："
+msgstr "The required packages and their corresponding versions are as follows:"
+
+#: ../../../install.md:6 dcb95218036f4452a92a5a9c2fdbe337
+msgid "Python == 3.10"
+msgstr ""
+
+#: ../../../install.md:7 79e3d9ff5df7455fa596ba63ce3089b7
+msgid "GCC == 10.2.0"
+msgstr ""
+
+#: ../../../install.md:8 d14840f7b64d4a32a0be5762027e9c32
+msgid "MPFR == 4.1.0"
+msgstr ""
+
+#: ../../../install.md:9 851e3e5c874a4d0f8fd37a4f85ec8f2f
+msgid "CUDA >= 11.7"
+msgstr ""
+
+#: ../../../install.md:10 dbf2012c72e1479ba6647baa047ecc04
+msgid "Pytorch >= 1.13.1"
+msgstr ""
+
+#: ../../../install.md:11 b191e289a079455ea906694a75439b3e
+msgid "Transformers >= 4.28.0"
+msgstr ""
+
+#: ../../../install.md:12 17accf19fe184e3cb704274d8a66e87e
+msgid "Flash-Attention >= v1.0.5"
+msgstr ""
+
+#: ../../../install.md:13 8063cdce4bb94947a07dbaedd97e1013
+msgid "Apex == 23.05"
+msgstr ""
+
+#: ../../../install.md:14 7d6d2682ed214d0cba0048903c128bce
+msgid "Ampere或者Hopper架构的GPU (例如H100, A100)"
+msgstr "GPU with Ampere or Hopper architecture (such as H100, A100)"
+
+#: ../../../install.md:15 91039fb42b94421586c558a2afcbed71
+msgid "Linux OS"
+msgstr ""
+
+#: ../../../install.md:17 694b95a146d54878a4a5d57e0c1e8c6c
+msgid "以上依赖包安装完成后，需要更新配置系统环境变量："
+msgstr "After installing the above dependencies, some system environment variables need to be updated:"
+
+#: ../../../install.md:29 d0ebf84438dc43708ea517c7eff92e79
+msgid "将项目`internlm`及其依赖子模块，从 github 仓库中 clone 下来，命令如下："
+msgstr "Clone the project `internlm` and its dependent submodules from the github repository, as follows:"
+
+#: ../../../install.md:34 c278177fc1974f3fac9b33688d0591fd
+msgid "推荐使用 conda 构建一个 Python-3.10 的虚拟环境， 并基于`requirements/`文件安装项目所需的依赖包："
+msgstr "It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:"
+
+#: ../../../install.md:43 6a152c8e332f47b0ba35a9bcec2ed32d
+msgid "安装 flash-attention (version v1.0.5)："
+msgstr "Install flash-attention (version v1.0.5):"
+
+#: ../../../install.md:55 d7b2116e6ca745ceb48a792fae371283
+msgid "安装 Apex (version 23.05)："
+msgstr "Install Apex (version 23.05):"
+
+#: ../../../install.md:62 8bcbfb9f74de4a2796212a339feb8283
+msgid "环境镜像"
+msgstr "Environment Image"
+
+#: ../../../install.md:63 6cbb97568d704cf19e7dabab20ce1d5b
+msgid ""
+"用户可以使用提供的 dockerfile 结合 docker.Makefile 来构建自己的镜像，或者也可以从 "
+"https://hub.docker.com/r/internlm/internlm 获取安装了 InternLM 运行环境的镜像。"
+msgstr "Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm."
+
+#: ../../../install.md:65 9c29ae2ac9984a8094daf52751f5c7b9
+msgid "镜像配置及构造"
+msgstr "Image Configuration and Build"
+
+#: ../../../install.md:66 12bd6b0729464cb5af663a384dadd0ec
+msgid ""
+"dockerfile 的配置以及构造均通过 docker.Makefile 文件实现，在 InternLM 根目录下执行如下命令即可 build "
+"镜像："
+msgstr "The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM:"
+
+#: ../../../install.md:70 b5f42dbca3e340c4bb80de1f502e0700
+msgid ""
+"在 docker.Makefile 中可自定义基础镜像，环境版本等内容，对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 "
+"ubuntu20.04 和 centos7。"
+msgstr "In docker.Makefile, you can customize the base image, environment version, etc., and the corresponding parameters can be passed directly through the command line. BASE_OS supports ubuntu20.04 and centos7."
+
+#: ../../../install.md:72 4abb47ce9cf64b3c9b8dc23ace37a826
+msgid "镜像拉取"
+msgstr "Pull Standard Image"
+
+#: ../../../install.md:73 1b6e61b2e0cb4da98f5d70d67ac638f9
+msgid "基于 ubuntu 和 centos 的标准镜像已经 build 完成也可直接拉取使用："
+msgstr "Standard images based on ubuntu and centos have already been built and can be pulled directly:"
+
+#: ../../../install.md:82 2bd75cc4b74848c19775e2b1c83726c1
+msgid "容器启动"
+msgstr "Run Container"
+
+#: ../../../install.md:83 4bb2dd4bba904255a204776a50721159
+msgid "对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器："
+msgstr "For the local standard image built with dockerfile or pulled, use the following command to run and enter the container:"
+
+#: ../../../install.md:87 66613606256e4094a6be5ab2af1269ae
+msgid "容器内默认目录即 `/InternLM`，根据[使用文档](./usage.md)即可启动训练。"
+msgstr "The default directory in the container is `/InternLM`, please start training according to the [Usage](./usage.md)."
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
@ -0,0 +1,198 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/monitor.rst:2 f95ef3bff8574c77a28ca2f6212cc4b8
+msgid "监控和告警"
+msgstr "Monitor and Alert"
+
+#: ../../source/monitor.rst:5 959bd4a6061f4483875c7950ab4546cf
+msgid "监控"
+msgstr "Monitoring"
+
+#: ../../source/monitor.rst:7 6071bc878d894865b73380cb887847c1
+msgid ""
+"InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` "
+"来初始化上下文监控管理。其中，一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` "
+"将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。"
+msgstr ""
+"InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize the monitoring context. Within it, "
+"a singleton ``internlm.monitor.monitor.MonitorManager`` manages the monitoring thread and uses "
+"``internlm.monitor.monitor.MonitorTracker`` to track the training lifecycle and training status."
+
+#: 9256a063b6dd449786f29e03ce085176
+#: internlm.monitor.monitor.initialize_monitor_manager:1 of
+msgid ""
+"Initialize monitor manager for monitoring training lifetime and alerting "
+"exception info to Feishu."
+msgstr ""
+
+#: 138340fca72a4226be901f7f16c8a590 904b7938fdea46bf81c1ef738aa7bfae
+#: 9ed2a7b4af2243b289e72b2751aec902 aa0dd0dc6bee4a5bb15cc9705f7c13ee
+#: internlm.monitor.alert.send_feishu_msg_with_webhook
+#: internlm.monitor.monitor.MonitorManager.start_monitor
+#: internlm.monitor.monitor.MonitorTracker
+#: internlm.monitor.monitor.initialize_monitor_manager of
+msgid "参数"
+msgstr ""
+
+#: 3b302339e1d143b6b1d782ff59c9396d 6a06f053828b4c80aef56970750e2085
+#: internlm.monitor.monitor.MonitorManager.start_monitor:3
+#: internlm.monitor.monitor.initialize_monitor_manager:3 of
+msgid "The training job name."
+msgstr ""
+
+#: 3330d06145ee4d35b0b3632e799a35b3 c105473f2f6a4f838a9f0d098762d698
+#: internlm.monitor.monitor.MonitorManager.start_monitor:5
+#: internlm.monitor.monitor.initialize_monitor_manager:5 of
+msgid "The Feishu webhook address for sending alert messages."
+msgstr ""
+
+#: 774c6ff82a2e452295a1a7dcabaded3d internlm.monitor.monitor.MonitorManager:1
+#: of
+msgid ""
+"Monitor Manager for managing monitor thread and monitoring training "
+"status."
+msgstr ""
+
+#: 72e696c0ce8f41ea8c7947d35cf322f0
+#: internlm.monitor.monitor.MonitorManager.monitor_loss_spike:1 of
+msgid "Check loss value, if loss spike occurs, send alert message to Feishu."
+msgstr ""
+
+#: 2b668b057fa84e8b92c65bfd49bfb3e9
+#: internlm.monitor.monitor.MonitorManager.monitor_exception:1 of
+msgid "Catch and format exception information, send alert message to Feishu."
+msgstr ""
+
+#: 9852b7143026476d89e1a175223e6d79
+#: internlm.monitor.monitor.MonitorManager.handle_sigterm:1 of
+msgid "Catch SIGTERM signal, and send alert message to Feishu."
+msgstr ""
+
+#: 2e3827bad7b1445fb0d9a7c5a28def5d
+#: internlm.monitor.monitor.MonitorManager.start_monitor:1 of
+msgid ""
+"Initialize and start monitor thread for checking training job status, "
+"loss spike and so on."
+msgstr ""
+
+#: 271cc3e1b0834a7ba6a1ba4d5cce0ef1
+#: internlm.monitor.monitor.MonitorManager.start_monitor:7 of
+msgid "The time of monitor interval in seconds, defaults to 300."
+msgstr ""
+
+#: e4a06091fce8401b83e31ce26c8075a0
+#: internlm.monitor.monitor.MonitorManager.start_monitor:9 of
+msgid ""
+"The limit multiple of current loss to previous loss value, which means "
+"loss spike may be occurs, defaults to 1.5."
+msgstr ""
+
+#: 28bde748477e41f39fa6ca3e1855923d
+#: internlm.monitor.monitor.MonitorManager.stop_monitor:1 of
+msgid "Stop the monitor and alert thread."
+msgstr ""
+
+#: ffb3dda227664748bdb326b6630bc827 internlm.monitor.monitor.MonitorTracker:1
+#: of
+msgid "Track job status and alert to Feishu during job training."
+msgstr ""
+
+#: a1e93683cbb04d8ab825e2776e76efa7 internlm.monitor.monitor.MonitorTracker:3
+#: of
+msgid "The Feishu webhook address for sending alerting messages."
+msgstr ""
+
+#: 7913eeecc0904c128046e80cec1553f2 internlm.monitor.monitor.MonitorTracker:5
+#: of
+msgid "The interval in seconds for monitoring checks. Defaults to 300."
+msgstr ""
+
+#: 8d1abc3067584866983139dd3d85c59c internlm.monitor.monitor.MonitorTracker:7
+#: of
+msgid "The threshold for detecting loss value spikes. Defaults to 1.5."
+msgstr ""
+
+#: a0416fd68700450793daa2167f776618
+#: internlm.monitor.monitor.MonitorTracker.run:1 of
+msgid "start the monitor tracker."
+msgstr ""
+
+#: f55eb990c07b4e8f9388236dd60f0017
+#: internlm.monitor.monitor.MonitorTracker.stop:1 of
+msgid "Stop the monitor tracker."
+msgstr ""
+
+#: ../../source/monitor.rst:18 2202bc091aab417097a1b0268dfe6785
+msgid "告警"
+msgstr "Alerting"
+
+#: ../../source/monitor.rst:20 69334f83e644455aa619dde70b8ed1f2
+msgid ""
+"InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等，并捕获 "
+"SIGTERM 异常信号。当出现上述情况时，将触发警报，并通过调用 "
+"``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook "
+"地址发送报警消息。"
+msgstr ""
+"The InternLM monitor thread periodically checks for loss spikes, potential training stuck conditions, and runtime exceptions during training, and catches the SIGTERM signal. "
+"When any of these situations occurs, an alert is triggered and a message is sent to the Feishu webhook address by calling "
+"``internlm.monitor.alert.send_feishu_msg_with_webhook()``."
+
+#: 15980526c2fa4ed8befa1604f271a3f1
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:1 of
+msgid "Use Feishu robot to send messages with the given webhook."
+msgstr ""
+
+#: 38e5738c2b914c8096e1a0f345e6c0b4
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:3 of
+msgid "The webhook to be used to send message."
+msgstr ""
+
+#: 4984f1a3bb0d46b48b2aad4fba8b43d9
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:5 of
+msgid "The message title."
+msgstr ""
+
+#: a9822a4cf30d4947b12f70a0efe62a5e
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:7 of
+msgid "The message body."
+msgstr ""
+
+#: 57d9ab65fe9f45c28351839fecf2f31e
+#: internlm.monitor.alert.send_feishu_msg_with_webhook of
+msgid "返回"
+msgstr ""
+
+#: 2b6ac97fd152498183a8624a9087812b
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:10 of
+msgid "The response from the request. Or catch the exception and return None."
+msgstr ""
+
+#: ec45dedf976046eb909f5b7f79a7d44c
+#: internlm.monitor.alert.send_feishu_msg_with_webhook of
+msgid "抛出"
+msgstr ""
+
+#: 4c6aeec19a6041cfbfa577b1c5a85ac1
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:12 of
+msgid "An exception rasied by the HTTP post request."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/parallel.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/parallel.po
@ -0,0 +1,457 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/parallel.rst:2 28d82a05db464e35aa3ec83e36597214
+msgid "并行训练"
+msgstr "Parallel Training"
+
+#: ../../source/parallel.rst:6 f5c2eef4812640fca0aeaef62a2d85d4
+msgid ""
+"InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 "
+"等并行化训练策略。在初始化分布式环境时，我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。"
+msgstr ""
+"InternLM supports tensor parallel, pipeline parallel, sequence parallel, data parallel, and ZeRO1.5 "
+"to parallelize the training pipeline. When initializing the distributed environment, we need to specify "
+"tensor parallel size, pipeline parallel size, data parallel size, and ZeRO1.5 strategy."
+
+#: ../../source/parallel.rst:8 649c52696a734a0c86d3d5377193aba5
+msgid ""
+"InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定，用户可以通过修改配置文件 `config file "
+"<https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ "
+"来更改并行配置。以下是一个并行训练配置示例："
+msgstr ""
+"The parallel setting of InternLM is specified by the ``parallel`` field in the config file, and you can change the parallelism by modifying "
+"`config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_. An example parallel "
+"training configuration can be defined as follows:"
+
+#: ../../source/parallel.rst:19 a06ae11e51ea479b9501ada103c9d071
+msgid "zero1：zero 并行策略，分如下三种情况，默认值为 -1"
+msgstr "zero1: zero parallel strategy, divided into the following three cases, the default value is -1"
+
+#: ../../source/parallel.rst:21 08005d5cdde84057b870495d9683c7be
+msgid "当 ``zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr "When ``zero1 <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range."
+
+#: ../../source/parallel.rst:22 fe30803c0aec4b70847ac40b68641e05
+msgid "当 ``zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr "When ``zero1 == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters."
+
+#: ../../source/parallel.rst:23 e0acea7d80094e018fab75404ec25163
+msgid ""
+"当 ``zero1 > 1`` 且 ``zero1 <= data_parallel_world_size``，则 zero1 "
+"进程组是数据并行进程组的子集"
+msgstr "When ``zero1 > 1`` and ``zero1 <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group."
+
+#: ../../source/parallel.rst:25 17bba79e2e884993a602df9cf20d2489
+msgid "tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1"
+msgstr "tensor: tensor parallel size, usually the number of GPUs per node, the default value is 1"
+
+#: ../../source/parallel.rst:26 3bda721a03a144f28f33d360a87cbf83
+msgid "pipeline：流水线并行策略"
+msgstr "pipeline: pipeline parallel strategy"
+
+#: ../../source/parallel.rst:28 2b10f2b57ef64fcc872d036a7ad82b03
+msgid "size：流水线并行大小，默认值为 1"
+msgstr "size: pipeline parallel size, the default value is 1"
+
+#: ../../source/parallel.rst:29 49c8a409e60244c49514a27780ae39a3
+msgid "interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为 False"
+msgstr "interleaved_overlap: bool type, whether to enable communication optimization when using interleaved scheduling, the default value is False"
+
+#: ../../source/parallel.rst:31 e4ff81960c434b78847174787f0423e2
+msgid "sequence_parallel：是否开启序列化并行，默认值为 False"
+msgstr "sequence_parallel: whether to enable sequence parallelism, the default value is False"
+
+#: ../../source/parallel.rst:33 a24f4bc81fea48619ae2720e0cb6a392
+msgid "注意：数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小"
+msgstr "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`"
+
+#: ../../source/parallel.rst:36 a93fc45f855c4ca7901ccbe23bf14edc
+msgid "张量并行"
+msgstr "Tensor Parallel"
+
+#: ../../source/parallel.rst:38 cce9e8f3c8f14c1c96c63273baceb164
+msgid ""
+"InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab"
+"/flash-attention>`_, 主要对 `attention "
+"<https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_"
+" 和 `linear "
+"<https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_"
+" 这两个模块进行张量并行操作。"
+msgstr ""
+"The implementation of tensor parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_, "
+"which has tensor parallel extensions to parallelize `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ "
+"and `linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ blocks in InternLM model. "
+
+#: ../../source/parallel.rst:41 f98a4b36ffdf4381a03899b605346be6
+msgid "用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。"
+msgstr "To use tensor parallel, you need to set the value of tensor parallel size ``parallel.tensor`` in the config file, which is usually the number of GPUs per node."
+
+#: ../../source/parallel.rst:47 956804e7cde441989212f7eb505e8815
+msgid "张量并行，采用自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
+msgstr "Tensor parallel, adopted from `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
+
+#: ../../source/parallel.rst:50 a6424fd0ff0246fcadf56436260fadb6
+msgid "流水线并行"
+msgstr "Pipeline Parallel"
+
+#: ../../source/parallel.rst:52 f2c163418fed432a8f3f59f1a5229e88
+msgid ""
+"InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ "
+"（1F1B，一次前向传递后跟一次反向传递）策略。对于 1F1B 策略，有两种实现方式："
+msgstr "InternLM uses `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ (one forward pass followed by one backward pass) for pipeline parallel. For 1F1B strategy, there are two implementations:"
+
+#: ../../source/parallel.rst:54 43f3b988e2924fe9968b9d049b46ffa0
+msgid "非交错调度器，内存高效。"
+msgstr "non-interleaved scheduler, which is memory-efficient"
+
+#: ../../source/parallel.rst:55 7a45446082c441d48d49b6be661ea8d2
+msgid "交错调度器，内存高效且时间高效（GPU空泡较少）。"
+msgstr "interleaved scheduler, which is both memory-efficient and time-efficient (fewer GPU bubbles)."
+
+#: ../../source/parallel.rst:61 92f2a168d7794811b56f9bb3bc170982
+msgid "1F1B 流水线并行调度器，采用自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
+msgstr "Non-interleaved and interleaved scheduler for 1F1B pipeline parallelism, adopted from `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
+
+#: ../../source/parallel.rst:64 a6d3df0b74b14b158a04ddda3e904004
+msgid "非交错式流水线调度"
+msgstr "scheduler for non-interleaved 1F1B strategy"
+
+#: ../../source/parallel.rst:65 1fa48743f39a44a29d78fb7f9eed5a52
+msgid "如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``。"
+msgstr "To use non-interleaved pipeline scheduler, users need to set ``model.num_chunks = 1`` in the config file."
+
+#: 57206dc0bc734686841c363c88839708
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:1 of
+msgid ""
+"A helper schedule class for pipeline parallelism running environment. It "
+"uses non-interleaved 1F1B strategy. Other properties are similar as "
+":class:`NonPipelineSchedule`."
+msgstr ""
+
+#: 6475fee6f3cd462ba1073a641b322e12 7060a021efb0459598f49f74e8e7185b
+#: 9218fee47e5542cab88ac65ff0054068 d1be8d5479fb48f59be379548ee24bd9
+#: d41da940b4a84cd0822c3f94c2eaf344 f5654fe6eacc49dba5baa1d058df5d29
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad of
+msgid "参数"
+msgstr ""
+
+#: 567e2a87a45245469af9f8709e020a20
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:5 of
+msgid "The number of microbatches."
+msgstr ""
+
+#: 6d3b2256ea9c4897bf72f551f8b4696b
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:7 of
+msgid "Type of data. torch.float by default."
+msgstr ""
+
+#: 6e36198f5ed344f7ad02f56aec9a333c
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:9 of
+msgid ""
+"The post processing function which receives a micro batch of data, and it"
+" will be executed in `load_micro_batch`."
+msgstr ""
+
+#: ffae9611bd854615af1ced927f72c556
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:12 of
+msgid "Specified shape in pipeline communication."
+msgstr ""
+
+#: 31d45af550334cb8a94142da335b9724
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:14 of
+msgid ""
+"If set to `True`, communication will be reduced over pipeline when using "
+"1D tensor parallelization."
+msgstr ""
+
+#: 5c852dc7866f4e50ab87c15b86d338f2
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:16 of
+msgid "List of scheduler hooks."
+msgstr ""
+
+#: 4ebec38a972b4c31a59f1fc824d51f62
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:1
+#: of
+msgid "To perform actions before running the schedule."
+msgstr ""
+
+#: d491d0dfa1bf41708150cc57567ac0f0
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:3
+#: of
+msgid "InternLM engine for training and inference."
+msgstr ""
+
+#: bc5dc62440b94825b192ad2e28641976
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:1
+#: of
+msgid ""
+"Runs non-interleaved 1F1B schedule, with communication between pipeline "
+"stages. Returns a tuple with losses if the last stage, an empty tuple "
+"otherwise."
+msgstr ""
+
+#: 765809e448b644678a9fb822f6427a94 99c948f562e343aabdecac2d43650f59
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:4
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:4
+#: of
+msgid "Colossalai engine for training and inference."
+msgstr ""
+
+#: 31af7a46c5a645628bea05ad35757dcf 4ea88ec52c5b4df79a57ab2d217de697
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:6
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:6
+#: of
+msgid ""
+"Dataloader as the form of an iterator, obtained by calling "
+"iter(dataloader)."
+msgstr ""
+
+#: 2deff747718449fabc5b47a1de0be52e e0d2e154ac134da28470924aa65342a1
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:8
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:8
+#: of
+msgid ""
+"Whether run forward step only. Default is false. If true, no backward "
+"will be run."
+msgstr ""
+
+#: 71aa2b45248c4af28525dbc1ba4a1aff d3b3c1e350334dd2a16cbb2e8c8d339a
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:10
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:10
+#: of
+msgid "Whether returns the loss value. Default is true."
+msgstr ""
+
+#: 2021eaca687148539b03f6b0b1c118c8 5c138015fb254eccae2f0df2dab45629
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:12
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:12
+#: of
+msgid "If False, the output and label won't be returned."
+msgstr ""
+
+#: 57a86115b88541b1a7220d9535058607 5dabcd12b6d844aab8039b022ad0cf1c
+#: b8ccfee837a242a3abbdf9e15eaa53d8
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
+msgid "返回"
+msgstr ""
+
+#: 7dc47f5518e64d1095a6051184985f17 fe678c953e8149a5ade387e95d10d3b2
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:17
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:15
+#: of
+msgid "A tuple of (output, label, loss), loss and label could be None."
+msgstr ""
+
+#: a50c7c3d40e14ba8a5af06aa0cb031cb ea3574b76d604402a41fcd3874d05c9a
+#: fa12b183c7534a20b61445eb9f2a2a7a
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
+msgid "返回类型"
+msgstr ""
+
+#: 82936eed6da5408c9361732f8fd5cb93 c46a28c21ca149d98ff625b7fdad4c03
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:19
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:16
+#: of
+msgid "Tuple[:class:`torch.Tensor`]"
+msgstr ""
+
+#: ../../source/parallel.rst:71 d2bfdbbd9a7641c38e6957a72ac6bc97
+msgid "交错式流水线调度"
+msgstr "scheduler for interleaved 1F1B strategy"
+
+#: ../../source/parallel.rst:72 395c484fef984a65a284147dc3056241
+msgid "如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``。"
+msgstr "To use interleaved pipeline scheduler, users need to set ``model.num_chunks > 1`` in the config file."
+
+#: 036fffe3aacc4400af38ce5252840a50
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler:1 of
+msgid "Interleaved Pipeline Scheduler."
+msgstr ""
+
+#: 1b6e63b4004e44999e3ad38382b4e308
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:1
+#: of
+msgid ""
+"Run interleaved 1F1B schedule (model split into model chunks), with "
+"communication between pipeline stages as needed."
+msgstr ""
+
+#: 6ece1dfcdb5e408db4870d6c0f524787
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:15
+#: of
+msgid ""
+"A tuple of (output, label, loss), loss and label could be None.     The "
+"loss would be returned only in the last stage."
+msgstr ""
+
+#: ed7e5a4826f84e9eb2840e494761437f
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:18
+#: of
+msgid "The loss would be returned only in the last stage."
+msgstr ""
+
+#: ../../source/parallel.rst:77 1b771fea1d434f0b8b118f1b5344dde4
+msgid "值得注意的是，在使用交错式流水线调度器时可启用通信优化功能，即在 1F1B 阶段启用异步通信，以充分利用上行/下行带宽并实现通信与计算重叠。"
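+msgstr "It is worth noting that communication optimization can be enabled when using the interleaved pipeline scheduler: asynchronous communication is enabled in the 1F1B stage to make full use of uplink/downlink bandwidth and to overlap communication with computation."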
+
+#: ../../source/parallel.rst:79 27430e179b454d48a052b9fe6e11ecae
+msgid ""
+"用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = "
+"True``。该功能启用后，将调用函数 "
+"``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``，并创建 "
+"``internlm.core.communication.AsynCommunicator`` 以管理异步通信。"
+msgstr ""
+"When ``parallel.pipeline.interleaved_overlap = True``, function ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap`` will be called and "
+"``internlm.core.communication.AsynCommunicator`` will be created for managing async communication."
+
+#: ../../source/parallel.rst:81 4e0b6269ca48430098ed4619d0f0f22f
+msgid "``1F1B-without-overlap`` 和 ``1F1B-with-overlap`` 的区别如下所示："
+msgstr "The difference between 1F1B stage without overlap and 1F1B stage with overlap is shown as follows:"
+
+#: ../../source/parallel.rst:102 8412b1f6f51c479d9cbb281763215327
+msgid "序列并行"
+msgstr "Sequence Parallel"
+
+#: ../../source/parallel.rst:104 45aea8164dd244e5a730881c693eeecf
+msgid ""
+"序列并行是一种在不引入额外计算、通信和内存开销的情况下，减少层 ``layer_norm`` 和 ``dropout`` "
+"操作中的激活值内存。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-"
+"AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗，提高了模型在资源受限环境中的可扩展性。"
+msgstr ""
+"Sequence parallel is a technique to reduce activation memory in layer norm and dropout without additional computation, "
+"communication or memory overhead. The implementation of sequence parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_. "
+"This parallel strategy helps reduce the memory consumption of the model and improves its scalability in resource-constrained environments."
+
+#: ../../source/parallel.rst:106 29836b441ee84df6a6dbe877930ba911
+msgid "如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``。"
+msgstr "To enable sequence parallel, you need to set ``parallel.sequence_parallel = True`` in the config file."
+
+#: ../../source/parallel.rst:112 eadcd6e77c2547998b4e132939a15856
+msgid "序列并行, 采用自 flash-attention"
+msgstr "Sequence parallel, adopted from flash-attention"
+
+#: ../../source/parallel.rst:115 47a0ac84251949fab0d9d8d34efb8751
+msgid "数据并行"
+msgstr "Data Parallel"
+
+#: ../../source/parallel.rst:117 938ad5a1cbc846bab36e8d2f4804a685
+msgid "InternLM 支持数据并行。数据并行大小为:"
+msgstr "InternLM supports data parallel. The data parallel size is:"
+
+#: ../../source/parallel.rst:119 1e8691a5ff4a4b40ae24815c681f7306
+msgid ""
+"`Data parallel size = Total number of GPUs / Pipeline parallel size / "
+"Tensor parallel size`"
+msgstr ""
+
+#: ../../source/parallel.rst:122 c417e2af4e8e45ca8ca18ad39e96dadd
+msgid "ZeRO1.5"
+msgstr ""
+
+#: ../../source/parallel.rst:124 9c05b4baf8a04e4b8a0f204c4e30cc9c
+msgid ""
+"ZeRO1.5 的实现使用了分层分片的概念，通过配置值 ``parallel.zero1`` "
+"启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度，以减少内存使用并提高训练效率。"
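+msgstr "The implementation of ZeRO1.5 uses the concept of hierarchical sharding via config value ``parallel.zero1``, which enables sharding within local nodes. This approach helps manage and distribute model parameters and gradients efficiently, reducing memory usage and improving training efficiency."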
+
+#: ../../source/parallel.rst:126 48c994fe37d54c35bbf81f4be070e151
+msgid "当 ``parallel.zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr "If ``parallel.zero1 <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range."
+
+#: ../../source/parallel.rst:127 3d31193758e24a08b1e90eae21259f71
+msgid "当 ``parallel.zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr "If ``parallel.zero1 == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters."
+
+#: ../../source/parallel.rst:128 fb5c43d2ac75423cabc12ba1512df25e
+msgid ""
+"当 ``parallel.zero1 > 1`` 且 ``parallel.zero1 <= "
+"data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集"
+msgstr "If ``parallel.zero1 > 1`` and ``parallel.zero1 <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group. For smaller models, it is usually a better choice to split the parameters within nodes with a setting ``parallel.zero1 <= 8``."
+
+#: ../../source/parallel.rst:130 47f03cea956a4477854591363359cdb3
+msgid ""
+"此外，用户可以在配置文件中通过 ``hybrid_zero_optimizer`` "
+"字段启用优化器的通信优化功能，设置桶大小，以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率，以及梯度的处理方式。"
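+msgstr "Furthermore, you can enable the optimizer's communication optimizations and set the bucket size, gradient clipping, and other parameters via the ``hybrid_zero_optimizer`` field in the config file. These settings help optimize communication and computation efficiency during training, as well as how gradients are handled."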
+
+#: ../../source/parallel.rst:144 dfc63103d4e341ccb7df8ef031e29f4e
+msgid "这里有两个值得关注的通信优化点："
+msgstr "There are two communication optimizations worth paying attention to here:"
+
+#: ../../source/parallel.rst:146 e4815f887d8f48368be01339b5e64d18
+msgid ""
+"overlap_sync_grad: 如果设置为 ``True``，则将训练的 ``backward pass`` 与梯度的 ``all-"
+"reduce`` 通信重叠"
+msgstr "overlap_sync_grad: If set to ``True``, the training ``backward pass`` is overlapped with the ``all-reduce`` communication of gradients."
+
+#: ../../source/parallel.rst:147 bcb1aedd8a89441488b211cd81d4f80c
+msgid ""
+"overlap_sync_param: 如果设置为 ``True``，则将参数的 ``broadcast`` 通信与下一步的 ``forward "
+"pass`` 进行重叠"
+msgstr "overlap_sync_param: If set to ``True``, the ``broadcast`` communication of parameters is overlapped with the next step's ``forward pass``."
+
+#: ../../source/parallel.rst:149 3ba64e4762084e93ba62a70c909e7d82
+msgid "这些优化可以加速训练过程，提高训练效率。"
+msgstr "These optimizations can speed up the training process and improve training efficiency."
+
+#: 757dad6b9916403c83042b49eaa35ae5
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer:1 of
+msgid "Hybrid Zero Optimizer."
+msgstr ""
+
+#: 83bcd49c056446f6806a55e6138579f2
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:1
+#: of
+msgid ""
+"Set parameter gradients to zero. If set_to_none = True, gradient will be "
+"set to None to save memory."
+msgstr ""
+
+#: 2d3da89d360c458f80844f9caed6c316
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:4
+#: of
+msgid "Whether set the gradient to None. Default value is True."
+msgstr ""
+
+#: 4164523156dc460cbbeaa17feed3c689
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:1 of
+msgid "Performs a single optimization step."
+msgstr ""
+
+#: 5c68dace1ec649bfa849b6652051daac
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:3 of
+msgid "A closure that reevaluates the model and returns the loss."
+msgstr ""
+
+#: 91e366d604ce48afa6b92666ece87b85
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:7 of
+msgid "Whether the gradient is success updated, and the gradient."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po
@ -0,0 +1,166 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-08 15:32+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/profiler.rst:2
+msgid "性能分析"
+msgstr "Profiler"
+
+#: ../../source/profiler.rst:7
+msgid "Torch Profiler"
+msgstr ""
+
+#: ../../source/profiler.rst:9
+msgid ""
+"InternLM 使用 ``internlm.train.initialize_llm_profile()`` "
+"来收集和分析模型训练或推理期间的性能数据，如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler "
+"<https://pytorch.org/docs/stable/profiler.html>`_ ，输出的性能分析 trace 文件可以使用 "
+"`tensorboard <https://www.tensorflow.org>`_ 进行可视化。"
+msgstr ""
+"InternLM uses ``internlm.train.initialize_llm_profile()`` to profile "
+"performance data, execution time duration and breakdown analysis of step "
+"time. The implementation is based on `torch.profiler "
+"<https://pytorch.org/docs/stable/profiler.html>`_ and output tracing "
+"files can be visualized with `tensorboard <https://www.tensorflow.org>`_."
+
+#: ../../source/profiler.rst:11
+msgid ""
+"用户如果想使用这个 torch 性能分析工具，需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch "
+"性能分析后，用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` "
+"文件夹中看到性能分析结果。"
+msgstr ""
+"To use this torch profiler tool, you need to enable profiling by passing "
+"the ``--profiling`` flag when starting training. After torch profiling is"
+" completed, you can find the profiling results in the "
+"``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder."
+
+#: ../../source/profiler.rst:13
+msgid "实际运行生成的 ``Torch Profiler`` 目录结构如下："
+msgstr "The directory structure of ``Torch Profiler`` generated files is as follows:"
+
+#: ../../source/profiler.rst:22
+msgid "其中， ``traces`` 可以通过 ``TensorBoard`` 可视化，运行命令"
+msgstr "Among them, ``traces`` can be visualized with ``TensorBoard`` by running the command"
+
+#: ../../source/profiler.rst:29
+msgid ""
+"在打开的 ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` "
+"页面可以看到Operator和GPU Kernel的性能分析时间线如下，更多的功能请参考 `torch profiler with "
+"tensorboard "
+"<https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html"
+"#pytorch-profiler-with-tensorboard>`_"
+msgstr "In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page, you can see the timeline of profiled operators and GPU kernels. For more usage, please refer to `torch profiler with tensorboard <https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard>`_"
+
+#: internlm.train.training_internlm.initialize_llm_profile:1 of
+msgid "Initialize and return the profiler context manager instance."
+msgstr ""
+
+#: ../../source/profiler.rst:38
+msgid "Memory Profiler"
+msgstr ""
+
+#: ../../source/profiler.rst:40
+msgid ""
+"InternLM 提供了一个实用的内存分析工具 "
+"``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` 来监控实际的 GPU"
+" 内存使用情况。在实现中，会对模型数据（包括模型参数、模型梯度和优化器状态）和非模型数据（包括激活值）分别进行详细的统计。"
+msgstr ""
+"InternLM provides a practical memory profiling tool "
+"``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` to monitor"
+" actual GPU memory usage. In the implementation, detailed statistics are "
+"collected separately for model data (including model parameters, model "
+"gradients, and optimizer states) and non-model data (including activations)."
+
+#: ../../source/profiler.rst:42
+msgid ""
+"要使用这个内存分析工具，用户需要在启动训练时传递 ``--profiling`` 参数以启用内存分析。完成内存分析后，用户可以在 "
+"``memory_trace/rank{}_dp{}_tp{}`` 文件夹中找到特定 rank "
+"对应的内存分析结果（包括不同时间点的内存使用日志和显示总体内存使用情况的太阳图表）。"
+msgstr ""
+"To use this memory profiler tool, you need to enable profiling by passing"
+" the ``--profiling`` flag when starting training. After memory profiling "
+"is completed, you can find the profiling results (including logs of "
+"memory usage at different time point and sunburst charts showing overall "
+"memory usage) for a specific rank device in the "
+"``memory_trace/rank{}_dp{}_tp{}`` folder."
+
+#: ../../source/profiler.rst:44
+msgid "实际运行生成的 ``memory_trace`` 目录结构如下："
+msgstr "The directory structure of ``memory_trace`` generated files is as follows:"
+
+#: ../../source/profiler.rst:107
+msgid "其中， ``memory.log`` 的内容示例如下："
+msgstr "An example of ``memory.log`` is as follows:"
+
+#: ../../source/profiler.rst:157
+msgid "模型参数的太阳图示例如下："
+msgstr "An example of model parameters sunburst chart is as follows:"
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:1 of
+msgid "A memory profiler for a llm model."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point of
+msgid "参数"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:3 of
+msgid "The model to profile."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:5 of
+msgid "The optimizer used for training the model."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:7 of
+msgid "The file to write the memory state information to."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:9 of
+msgid "number of steps to trace."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:1 of
+msgid "Record the memory state."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:3 of
+msgid "The options to include in the memory state. Defaults to \"\"."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:5 of
+msgid "Whether to create a new memory record file. Defaults to False."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step of
+msgid "返回"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:8
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:3 of
+msgid "None"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:1 of
+msgid "Update the memory state of the optimizer state."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/qa.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/qa.po
@ -0,0 +1,25 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/qa.rst:2 e3b22a39640a40cfb527068a7f4bbfc9
+msgid "问&答"
+msgstr "Q&A"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/training.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/training.po
@ -0,0 +1,127 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/training.rst:2 6eafa5eb08e040039309a39cdb0f1bfe
+msgid "训练 API"
+msgstr "Training API"
+
+#: ../../source/training.rst:4 74d81f3d0ca54c839d4e80bd589aedb2
+msgid ""
+"InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` "
+"管理。在定义了训练引擎和调度器之后，我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。"
+msgstr ""
+"InternLM training API is managed in ``internlm.core.trainer.Trainer``. After defining the "
+"training engine and runtime scheduler, we can call training API to perform training, evaluation, "
+"zero gradients and parameter update steps."
+
+#: ../../source/training.rst:6 0e0cfddbb2334d3da99d3289edf4161d
+msgid "有关详细用法，请参阅 Trainer API 文档和示例。"
+msgstr "For detailed usage, please refer to Trainer API documentation and examples."
+
+#: 7ea10280a8f1489984cb9994aa08976b internlm.core.trainer.Trainer:1 of
+msgid ""
+"This is a class tending for easy deployments of users' training and "
+"evaluation instead of writing their own scripts."
+msgstr ""
+
+#: 7969dca55840451193bffd3b071ab3b3 aff576168b59460491bb5da0ce41ea74
+#: internlm.core.trainer.Trainer internlm.core.trainer.Trainer.execute_schedule
+#: of
+msgid "参数"
+msgstr "Parameters"
+
+#: 59754d3e9ee8452a872bf397c01e0d8c internlm.core.trainer.Trainer:4 of
+msgid "Engine responsible for the process function."
+msgstr ""
+
+#: 2d18ff15256e48f98901c7a7e0cbbe35 internlm.core.trainer.Trainer:6 of
+msgid "Runtime schedule. Defaults to None."
+msgstr ""
+
+#: 76f4b3c7feba40eca3ee2b32559c53f5 internlm.core.trainer.Trainer.engine:1 of
+msgid ""
+"Returns the engine that responsible for managing the training and "
+"evaluation process."
+msgstr ""
+
+#: c7eae2d4d06c4ef891e314902d80b7f3 internlm.core.trainer.Trainer.schedule:1 of
+msgid "Returns the runtime scheduler."
+msgstr ""
+
+#: cb495b21b3444881aec83803e92386d9
+#: internlm.core.trainer.Trainer.uses_pipeline:1 of
+msgid "Returns whether the pipeline parallel is used or not."
+msgstr ""
+
+#: 86b0b631189e46468281a397c5e97350 internlm.core.trainer.Trainer.train:1 of
+msgid "Sets the model to training mode."
+msgstr ""
+
+#: f997e13120ee4d8b9e45ea6698b3e2a6 internlm.core.trainer.Trainer.eval:1 of
+msgid "Sets the model to evaluation mode."
+msgstr ""
+
+#: a8179e50312d47dcbe9de0433a65c2f7 internlm.core.trainer.Trainer.zero_grad:1
+#: of
+msgid "Sets the gradient of all parameters in the model to zero."
+msgstr ""
+
+#: f936136ef9e0452ca439b7c66dc8884b internlm.core.trainer.Trainer.step:1 of
+msgid "Executes the parameter update step."
+msgstr ""
+
+#: 250e2af89cfd432c84d228f9e03c174c
+#: internlm.core.trainer.Trainer.execute_schedule:1 of
+msgid ""
+"Runs the forward, loss computation, and backward for the model. Returns a"
+" tuple of (output, label, loss)."
+msgstr ""
+
+#: 6ca7de83033b432792eb0d7935ea04da
+#: internlm.core.trainer.Trainer.execute_schedule:4 of
+msgid "The data iterator."
+msgstr ""
+
+#: 6d3044e75b3149beba3c659e15607b79
+#: internlm.core.trainer.Trainer.execute_schedule:6 of
+msgid "Additional keyword arguments."
+msgstr ""
+
+#: 99d5a297d6414c30b432acf2566f0d3c
+#: internlm.core.trainer.Trainer.execute_schedule of
+msgid "返回"
+msgstr "Returns"
+
+#: b625ebf0cf874edba384456d33e740b4
+#: internlm.core.trainer.Trainer.execute_schedule:8 of
+msgid "A tuple of (output, label, loss)."
+msgstr ""
+
+#: 391cde57d2e2478d8f83a7ad270c2a65
+#: internlm.core.trainer.Trainer.execute_schedule of
+msgid "返回类型"
+msgstr "Return type"
+
+#: d4c4fb0fbddb499786970509cf0c9e13
+#: internlm.core.trainer.Trainer.execute_schedule:9 of
+msgid "Tuple[:class:`torch.Tensor`]"
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po
@ -0,0 +1,365 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 14:15+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../../usage.md:2 a64aaaa1525e4e01b0ddcebc42c24bbd
+msgid "使用教程"
+msgstr "Quickstart Guide"
+
+#: ../../../usage.md:4 f1b40737fb584d889b82c7f55b652977
+msgid ""
+"启动一个 Demo "
+"模型训练，需要进行三项准备，**安装**，**数据集准备**和**模型训练配置**。接下来，首先会介绍数据准备相关的操作，再简要描述模型训练配置相关的内容。"
+msgstr ""
+"To start a demo model training, you need to prepare three things: "
+"**installation**, **dataset preparation**, and **model training "
+"configuration**. In this guide, we will first cover the steps for dataset"
+" preparation and then briefly describe the model training configuration."
+
+#: ../../../usage.md:6 b35abe307c2f4d23866fff828308ebf2
+msgid "安装"
+msgstr "Installation"
+
+#: ../../../usage.md:7 64a8c1f5f71c45519e636aa7edba10bc
+msgid "请参考[安装文档](./install.md)进行安装。"
+msgstr ""
+"Please refer to the [installation guide](./install.md) for instructions "
+"on how to install the necessary dependencies."
+
+#: ../../../usage.md:9 bd96714d12ee415794dea5a4578bd8cd
+msgid "数据准备 （预训练）"
+msgstr "Dataset Preparation (Pre-training)"
+
+#: ../../../usage.md:11 5a0b39fb9da94e96b87db40d1f231a0c
+msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。"
+msgstr ""
+"The dataset for the InternLM training task includes a series of `bin` and"
+" `meta` files. A `tokenizer` is used to generate the training dataset "
+"from the original text files. The tokenizer model is imported by "
+"specifying the model parameter path in `tools/tokenizer.py`. Currently, "
+"`V7_sft.model` is provided to generate tokens. If you want to use a "
+"different model, you can directly modify the model parameter path in "
+"`tokenizer.py`."
+
+#: ../../../usage.md:13 3cef8126b8784af48d81cc140322909e
+msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件，其中参数`text_input_path`表示原始文本数据路径，目前支持`txt`、`json`和`jsonl`三种输入格式，`bin_output_path`表示生成的`bin`文件的保存路径。"
+msgstr ""
+"You can run the following command to generate `bin` and `meta` files "
+"corresponding to the original data. The parameter `text_input_path` "
+"represents the path of the original text data, currently supporting "
+"`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents "
+"the save path of the generated `bin` files."
+
+#: ../../../usage.md:18 107ff2280da14cb6a27f4e9857186333
+msgid "下面是一个数据处理的例子："
+msgstr "Here is an example of data processing:"
+
+#: ../../../usage.md:20 c11a9860263c4e2288a561f3435fa706
+msgid "给定一个包含原始数据集的文件`raw_data.txt`，原始数据集如下所示："
+msgstr ""
+"Given a file `raw_data.txt` containing the raw dataset, the raw dataset "
+"is shown below:"
+
+#: ../../../usage.md:27 4012599b42ab47bd979d2a0b79ca1147
+msgid "可以通过运行以下命令来生成`bin`和`meta`文件："
+msgstr ""
+"You can generate the `bin` and `meta` files by running the following "
+"command:"
+
+#: ../../../usage.md:32 cca91b6cf53a4082932dd34ea4b7f954
+msgid "需要注意的是，生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下，以区分数据集的类型。"
+msgstr ""
+"It should be noted that the generated `bin` files need to be saved in one"
+" of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or "
+"`kaoshi`, depending on the type of dataset."
+
+#: ../../../usage.md:34 417312ca1e35479e811953f777e3565a
+msgid "其中，`cn`表示中文数据集；`en`表示英文数据集；`code`表示代码数据集；`ja`表示日语数据集；`ar`表示阿拉伯语数据集；`kaoshi`表示考试数据集。"
+msgstr ""
+"Here, `cn` represents the Chinese dataset, `en` represents the English "
+"dataset, `code` represents the code dataset, `ja` represents the Japanese"
+" dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the"
+" exam dataset."
+
+#: ../../../usage.md:36 79c21f8e89b34499ba4e25e20593ec28
+msgid "生成的bin文件的格式如下："
+msgstr "The format of the generated `bin` files is as follows:"
+
+#: ../../../usage.md:42 26388d996c4e4116bc216be9bc007f62
+msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子，表示每个句子的`token`（下文将用sequence指定）。"
+msgstr ""
+"Each line in the `bin` file corresponds to each sentence in the original "
+"dataset, representing the tokens of each sentence (referred to as "
+"sequence below)."
+
+#: ../../../usage.md:44 b39148a85ee64a349975d26282fbe59b
+msgid "生成的`meta`文件的格式如下："
+msgstr "The format of the generated `meta` file is as follows:"
+
+#: ../../../usage.md:48 175a6007197a40568535f945672e5df2
+msgid ""
+"在`meta`文件中，每个元组对应着`bin`文件中每一个`sequence`的元信息。其中，元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting"
+" index`，第二个元素表示每个`sequence`中有多少个`tokens`。"
+msgstr ""
+"Each tuple in the `meta` file represents the meta information of each "
+"`sequence`, where the first element in the tuple indicates the `starting "
+"index` of each `sequence` among all `sequences`, and the second element "
+"indicates the number of `tokens` for each `sequence`."
+
+#: ../../../usage.md:50 46874a3de3924837979f9949f1237e39
+msgid ""
+"例如，对于第一个`sequence`，`starting index`为 0，有 11 "
+"个`tokens`；对于第二个`sequence`，由于第一个`sequence`转换为`string`后的长度为`89`，因此它的`starting"
+" index`为 90，有 15 个`tokens`。"
+msgstr ""
+"For example, the first `sequence` starts at index 0 and has 11 `tokens`. "
+"Because the first `sequence` has a length of 89 once converted to a `string`, the second `sequence` starts at index 90 and has 15 `tokens`."
+
+#: ../../../usage.md:52 25ea049fa411408b8856e7aa657835ab
+msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致，此处不再赘叙。"
+msgstr ""
+"The `bin` and `meta` file formats for `json` and `jsonl` type files are "
+"the same as for `txt`, so we won't go over them here."
+
+#: ../../../usage.md:54 bc52f959cb57494483a181e843014ed1
+msgid "数据准备 （微调）"
+msgstr "Data Preparation (Fine-tuning)"
+
+#: ../../../usage.md:56 73c74620c2994486acc747ba0c7f0b46
+msgid ""
+"微调任务的数据集格式与预训练任务保持一致，生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca "
+"数据集为例，介绍微调的数据准备流程。"
+msgstr ""
+"The data format for fine-tuning tasks is the same as for pre-training "
+"tasks, which consists of a series of `bin` and `meta` files. Let's take "
+"the Alpaca dataset as an example to explain the data preparation process "
+"for fine-tuning."
+
+#: ../../../usage.md:58 75f0e22d10ca413389ec8b947ae6141f
+msgid ""
+"下载 [Alpaca 数据集](https://github.com/tatsu-"
+"lab/stanford_alpaca/blob/main/alpaca_data.json)"
+msgstr ""
+"Download the [Alpaca dataset](https://github.com/tatsu-"
+"lab/stanford_alpaca/blob/main/alpaca_data.json)."
+
+#: ../../../usage.md:60 667606fcea454af48353a5b40f82fc46
+msgid "对 Alpaca 数据进行 tokenize，使用以下命令"
+msgstr "Tokenize the Alpaca dataset using the following command:"
+
+#: ../../../usage.md:66 60283b9237c8462ea37288b8ece79081
+msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize"
+msgstr ""
+"It is recommended that users refer to alpaca_tokenizer.py to write new "
+"scripts to tokenize their own datasets"
+
+#: ../../../usage.md:68 cdf45a4de9874e9fb65f7104dcee3c61
+msgid "训练配置"
+msgstr "Training Configuration"
+
+#: ../../../usage.md:70 7c42ebc23246450cbc1270e1461b16f6
+msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例，介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。"
+msgstr ""
+"Taking the configuration file `configs/7B_sft.py` for the 7B demo as an "
+"example, let's discuss the data, model, and parallel configurations "
+"required to start a model training."
+
+#: ../../../usage.md:72 247cfe98a7f44c2293aa2e2351f1ea69
+msgid "数据配置"
+msgstr "Data Configuration"
+
+#: ../../../usage.md:73 31327e7dce5848778db5361b3fbded1c
+msgid "数据相关的关键参数配置及释义如下所示："
+msgstr "Here are the key parameters and their explanations for data configuration:"
+
+#: ../../../usage.md:88 4d2608136fef4141bd6e47f78b8591b2
+msgid "![pack_into_one](./imgs/pack_into_one.png)"
+msgstr ""
+
+#: ../../../usage.md:88 c5acb028f2694712b2af788a864d5927
+msgid "pack_into_one"
+msgstr ""
+
+#: ../../../usage.md:91 db6b9ce8e8294952845893dd7aad098f
+msgid "目前支持传入数据集文件路径`train_folder`，且要求文件格式如下："
+msgstr ""
+"Currently, it supports passing the dataset file path `train_folder`, and "
+"the file format is required to be as follows:"
+
+#: ../../../usage.md:98 f22536fc3dfa4552a103a7cb57a20f92
+msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
+msgstr ""
+"For detailed information about the dataset, please refer to the \"Data "
+"Preparation\" section."
+
+#: ../../../usage.md:100 bc4f0b06e9c24730a7a831b7aca417e2
+msgid "模型配置"
+msgstr "Model Configuration"
+
+#: ../../../usage.md:102 ecf278a0a851496fae2e49c436e59368
+msgid "如果在启动训练时要加载模型 `checkpoint`，可进行如下相关配置："
+msgstr ""
+"If you want to load a model checkpoint when starting the training, you "
+"can configure it as follows:"
+
+#: ../../../usage.md:115 38244aba74294067a4019d0777621746
+msgid "注意："
+msgstr "Note:"
+
+#: ../../../usage.md:116 19d1eb0a797f4bd9a702a00e525d7753
+msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
+msgstr ""
+"`load_model_only_folder` and `load_ckpt_folder` cannot be set at the same"
+" time."
+
+#: ../../../usage.md:117 3ea27a1f6be044a3959890be69311b24
+msgid "路径若以 `local:` 为前缀，则存储在本地文件系统；若以 `boto3:` 为前缀，则存储在远程 oss 上"
+msgstr ""
+"If the path starts with `local:`, it means the file is stored in the "
+"local file system. If it starts with `boto3:`, it means the file is "
+"stored in the remote OSS."
+
+#: ../../../usage.md:119 1d6381b4cfff41d8bdd5347e8a135869
+msgid "模型相关关键参数配置如下所示："
+msgstr "The configuration for the model is as follows:"
+
+#: ../../../usage.md:143 1026791c9f054576857ef1930db6b167
+msgid "注意：用户可自定义模型类型名和模型结构，并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册，在训练主函数`train.py`中初始化模型时，可通过`model_type`配置获取指定的模型初始化接口函数。"
+msgstr ""
+"Note: Users can customize the model type name and model structure, and "
+"configure the corresponding model parameters. The model initialization "
+"function interface can be registered through the `MODEL_INITIALIZER` "
+"object in `utils/registry.py`. When initializing the model in the "
+"training main function `train.py`, the specified model initialization "
+"interface function can be obtained through the `model_type` "
+"configuration."
+
+#: ../../../usage.md:145 34823bcbe7754190bc9747758c1aad0c
+msgid ""
+"*如果基于 InternLM 7B继续训练，可以参考 "
+"[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "
+"OpenXLab 链接下载权重*"
+msgstr ""
+"*If you want to start training based on InternLM 7B, you can refer to "
+"OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
+"zoo) to download weights*."
+
+#: ../../../usage.md:147 4cabc928f8884cd38a6bb683b3bfade3
+msgid "并行配置"
+msgstr "Parallel Configuration"
+
+#: ../../../usage.md:149 f97ade07340340959345e73567bae793
+msgid "训练并行配置样例如下："
+msgstr "Training parallel configuration example:"
+
+#: ../../../usage.md:158 87fb5a4e4a4047ee8a9b8bb43915636d
+msgid "zero1：zero 并行策略，分如下三种情况，默认值为 -1"
+msgstr ""
+"zero1: zero parallel strategy, divided into the following three cases, "
+"default value is -1"
+
+#: ../../../usage.md:159 58dc08e2c52e4aaba99b4fbb6cf2e8b4
+#, fuzzy
+msgid "当`zero1 <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr ""
+"When `zero1 <= 0`, the size of the zero1 process group is equal to the "
+"size of the data parallel process group, so the optimizer state "
+"parameters will be split within the data parallel range."
+
+#: ../../../usage.md:160 67e2ebd795d840b29fd1d684a068e90d
+#, fuzzy
+msgid "当`zero1 == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr ""
+"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain "
+"the complete optimizer state parameters."
+
+#: ../../../usage.md:161 7caedfc943514b9b83090b858ef6d163
+#, fuzzy
+msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集"
+msgstr ""
+"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process"
+" group is a subset of the data parallel process group."
+
+#: ../../../usage.md:162 b38d3a1f72d543c6a44728fb6babea6b
+msgid "tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1"
+msgstr ""
+"tensor: tensor parallel size, usually the number of GPUs per node, "
+"default is 1"
+
+#: ../../../usage.md:163 237ac76df68f4a999396dad37c5495c3
+msgid "pipeline：流水线并行策略"
+msgstr "pipeline: pipeline parallel strategy"
+
+#: ../../../usage.md:164 c8c38f6ab2ea432eb9ebbb62618ca33e
+msgid "size：流水线并行大小，默认值为 1"
+msgstr "size: pipeline parallel size, the default value is 1"
+
+#: ../../../usage.md:165 b9158818e72e49acbdd52ad317cb80df
+msgid "interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为关闭"
+msgstr ""
+"interleaved_overlap: bool type, when interleaved scheduling, enable or "
+"disable communication optimization, the default value is False"
+
+#: ../../../usage.md:166 28e4d48661ff4f80aff788fdda604433
+msgid "sequence_parallel：是否开启序列化并行，默认值为 False"
+msgstr ""
+"sequence_parallel: Whether to enable sequence parallelism, the default "
+"value is False"
+
+#: ../../../usage.md:168 27528ab826824d2280506460e1f2f7bd
+msgid "注意：`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
+msgstr ""
+"Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
+" / Tensor parallel size`"
+
+#: ../../../usage.md:170 5a7af23cec604f1d9096a5ab81993c87
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../../usage.md:172 795e51542ed84cea83b63c5233bb88bc
+msgid "完成了以上数据集准备和相关训练配置后，可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例，介绍训练启动方式。"
+msgstr ""
+"After completing the data preparation and relevant training "
+"configurations mentioned above, you can start the demo training. The "
+"following examples demonstrate how to start the training in both slurm "
+"and torch environments."
+
+#: ../../../usage.md:174 96402cbe443044c0a0a1695c9847140b
+msgid "若在 slurm 上启动分布式运行环境，多节点 16 卡的运行命令如下所示："
+msgstr ""
+"If you want to start distributed training on slurm with 16 GPUs across "
+"multiple nodes, use the following command:"
+
+#: ../../../usage.md:179 c569e60401a6471eb9af2473acc4d5a6
+msgid "若在 torch 上启动分布式运行环境，单节点 8 卡的运行命令如下所示："
+msgstr ""
+"If you want to start distributed training on torch with 8 GPUs on a "
+"single node, use the following command:"
+
+#: ../../../usage.md:184 a045a060d0734aab9d894aed553cef34
+msgid "运行结果"
+msgstr "Training Results"
+
+#: ../../../usage.md:186 c68e8dfa259647c7a6e6e0c0446b0b18
+msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例，训练结果日志展示如下："
+msgstr ""
+"Taking the configuration of the demo training on a single machine with 8 "
+"GPUs on slurm as an example, the training result log is shown below:"
+
--- a/doc/code-docs/make.bat
+++ b/doc/code-docs/make.bat
@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation. Runs sphinx-build -M with the first argument as the target, or with the help target when no argument is given.
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/doc/code-docs/requirements.txt
+++ b/doc/code-docs/requirements.txt
@ -0,0 +1,11 @@
+Sphinx
+sphinx-autobuild
+sphinx_rtd_theme
+sphinx_markdown_tables
+autodoc_pydantic==1.9
+enum_tools
+numpy
+torch
+tqdm
+pyecharts
+myst-parser
--- a/doc/code-docs/source/checkpoint.rst
+++ b/doc/code-docs/source/checkpoint.rst
@ -0,0 +1,11 @@
+模型保存
+===================
+
+InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。 其中，可以
+使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份，并在接收信号退出训练时自动进行模型备份。
+
+Checkpointing
+-------------
+
+.. autoclass:: internlm.utils.model_checkpoint.CheckpointManager
+    :members:
--- a/doc/code-docs/source/conf.py
+++ b/doc/code-docs/source/conf.py
@ -0,0 +1,103 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+import sys
+
+project = "InternLM"
+copyright = "2023, InternLM Team"
+author = "InternLM Team"
+
+with open("../../../version.txt", "r") as f:  # relative path; assumes the build runs with this directory as cwd (TODO confirm)
+    release = f.readline().rstrip()  # first line of version.txt with trailing whitespace removed
+
+master_doc = "index"
+
+autodoc_member_order = "bysource"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "sphinx_rtd_theme",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.autodoc",
+    "sphinxcontrib.autodoc_pydantic",
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.napoleon",
+    "myst_parser",
+]
+
+pygments_style = "sphinx"
+
+# autodoc_pydantic config
+autodoc_pydantic_model_show_field_summary = False
+autodoc_pydantic_field_signature_prefix = " "
+autodoc_pydantic_model_signature_prefix = "class"
+autodoc_pydantic_model_show_json = False
+autodoc_pydantic_model_show_config_summary = False
+autodoc_pydantic_model_show_config_member = False
+autodoc_pydantic_model_show_validator_summary = False
+autodoc_pydantic_model_show_validator_members = False
+autodoc_pydantic_model_summary_list_order = "bysource"
+autodoc_pydantic_model_member_order = "bysource"
+autodoc_pydantic_field_list_validators = False
+
+# Napoleon settings
+napoleon_google_docstring = True
+napoleon_numpy_docstring = True
+napoleon_include_init_with_doc = False
+napoleon_include_private_with_doc = False
+napoleon_include_special_with_doc = True
+napoleon_use_admonition_for_examples = False
+napoleon_use_admonition_for_notes = False
+napoleon_use_admonition_for_references = False
+napoleon_use_ivar = False
+napoleon_use_param = True
+napoleon_use_rtype = True
+napoleon_preprocess_types = False
+napoleon_type_aliases = None
+napoleon_attr_annotations = True
+
+templates_path = ["_templates"]
+
+exclude_patterns = []
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = ["_static"]
+
+# GitHub integration
+html_context = {
+    "display_github": True,
+    "github_user": "InternLM",
+    "github_repo": "InternLM",
+    "github_version": "master",  # NOTE(review): other InternLM links use the "main" branch; confirm "master" exists
+    "conf_py_path": "/doc/code-docs/source/",
+}
+
+sys.path.insert(0, os.path.abspath("../../../"))  # directory three levels up goes first on sys.path (presumably the repo root, for autodoc imports; confirm)
+
+# Prepend module names to class descriptions
+add_module_names = True
+
+autoclass_content = "class"
+
+autodoc_mock_imports = [  # modules autodoc mocks instead of importing
+    "apex",
+    "torch",
+    "numpy",
+]
+
+# support multi-language docs
+language = "zh_CN"
+locale_dirs = ["../locales/"]  # path is example but recommended.
+gettext_compact = False  # optional.
+gettext_uuid = False  # optional.
--- a/doc/code-docs/source/example/30B_demo.rst
+++ b/doc/code-docs/source/example/30B_demo.rst
@ -0,0 +1,202 @@
+30B Demo
+================
+
+训练配置
+----------------
+
+30B demo 训练配置文件样例如下:
+
+.. code-block:: python
+
+    JOB_NAME = "30b_train"
+
+    SEQ_LEN = 2048
+    HIDDEN_SIZE = 6144
+    NUM_ATTENTION_HEAD = 48
+    MLP_RATIO = 8 / 3
+    NUM_LAYER = 60
+    VOCAB_SIZE = 103168
+
+    MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+    # Ckpt folder format:
+    # fs: 'local:/mnt/nfs/XXX'
+    SAVE_CKPT_FOLDER = "local:llm_ckpts"
+    LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+    # boto3 Ckpt folder format:
+    # import os
+    # BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+    # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+    # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+    CHECKPOINT_EVERY = 50
+    ckpt = dict(
+        enable_save_ckpt=False,  # enable ckpt save.
+        save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+        # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
+        # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
+        load_optimizer=True,  # Whether to load optimizer states when continuing training.
+        checkpoint_every=CHECKPOINT_EVERY,
+        async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
+        async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+        snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
+        oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+    )
+
+    TRAIN_FOLDER = "/path/to/dataset"
+    VALID_FOLDER = "/path/to/dataset"
+    data = dict(
+        seq_len=SEQ_LEN,
+        # micro_num means the number of micro_batch contained in one gradient update
+        micro_num=4,
+        # packed_length = micro_bsz * SEQ_LEN
+        micro_bsz=2,
+        # defaults to the value of micro_num
+        valid_micro_num=4,
+        # defaults to 0, means disable evaluate
+        valid_every=50,
+        pack_sample_into_one=False,
+        total_steps=50000,
+        skip_batches="",
+        rampup_batch_size="",
+        # Datasets with less than 50 rows will be discarded
+        min_length=50,
+        # train_folder=TRAIN_FOLDER,
+        # valid_folder=VALID_FOLDER,
+    )
+
+    grad_scaler = dict(
+        fp16=dict(
+            # the initial loss scale, defaults to 2**16
+            initial_scale=2**16,
+            # the minimum loss scale, defaults to None
+            min_scale=1,
+            # the number of steps to increase loss scale when no overflow occurs
+            growth_interval=1000,
+        ),
+        # the multiplication factor for increasing loss scale, defaults to 2
+        growth_factor=2,
+        # the multiplication factor for decreasing loss scale, defaults to 0.5
+        backoff_factor=0.5,
+        # the maximum loss scale, defaults to None
+        max_scale=2**24,
+        # the number of overflows before decreasing loss scale, defaults to 2
+        hysteresis=2,
+    )
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimizer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+    loss = dict(
+        label_smoothing=0,
+    )
+
+    adam = dict(
+        lr=1e-4,
+        adam_beta1=0.9,
+        adam_beta2=0.95,
+        adam_beta2_c=0,
+        adam_eps=1e-8,
+        weight_decay=0.01,
+    )
+
+    lr_scheduler = dict(
+        total_steps=data["total_steps"],
+        init_steps=0,  # optimizer_warmup_step
+        warmup_ratio=0.01,
+        eta_min=1e-5,
+        last_epoch=-1,
+    )
+
+    beta2_scheduler = dict(
+        init_beta2=adam["adam_beta2"],
+        c=adam["adam_beta2_c"],
+        cur_iter=-1,
+    )
+
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+    """
+    zero1 parallel:
+        1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    pipeline parallel (dict):
+        1. size: int, the size of pipeline parallel.
+        2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+    tensor parallel: tensor parallel size, usually the number of GPUs per node.
+    """
+    parallel = dict(
+        zero1=-1,
+        tensor=4,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+    cudnn_deterministic = False
+    cudnn_benchmark = False
+
+
+启动训练
+----------------
+
+完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动两节点 16GPU 的训练命令如下所示：
+
+.. code-block:: bash
+
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/30B_sft.py
+
+训练结果
+----------------
+
+基于以上训练配置和启动命令，两节点 16GPU 下的模型训练部分日志展示如下：
+
+.. code-block:: bash
+
+    2023-09-06 10:29:26,629 INFO parallel_context.py:508 in set_device -- process rank 10 is bound to host:HOST-10-140-66-20 device: 2
+    2023-09-06 10:29:26,632 INFO parallel_context.py:508 in set_device -- process rank 11 is bound to host:HOST-10-140-66-20 device: 3
+    2023-09-06 10:29:26,634 INFO parallel_context.py:508 in set_device -- process rank 12 is bound to host:HOST-10-140-66-20 device: 4
+    2023-09-06 10:29:26,636 INFO parallel_context.py:508 in set_device -- process rank 9 is bound to host:HOST-10-140-66-20 device: 1
+    2023-09-06 10:29:26,640 INFO parallel_context.py:508 in set_device -- process rank 15 is bound to host:HOST-10-140-66-20 device: 7
+    2023-09-06 10:29:26,639 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:HOST-10-140-66-9 device: 0
+    2023-09-06 10:29:26,641 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:HOST-10-140-66-9 device: 2
+    2023-09-06 10:29:26,643 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:HOST-10-140-66-9 device: 5
+    2023-09-06 10:29:26,645 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:HOST-10-140-66-9 device: 6
+    2023-09-06 10:29:26,661 INFO parallel_context.py:508 in set_device -- process rank 13 is bound to host:HOST-10-140-66-20 device: 5
+    2023-09-06 10:29:26,707 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:HOST-10-140-66-9 device: 1
+    2023-09-06 10:29:26,826 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:HOST-10-140-66-9 device: 4
+    2023-09-06 10:29:26,871 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:HOST-10-140-66-9 device: 7
+    2023-09-06 10:29:26,932 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:HOST-10-140-66-9 device: 3
+    2023-09-06 10:29:27,156 INFO parallel_context.py:508 in set_device -- process rank 14 is bound to host:HOST-10-140-66-20 device: 6
+    2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0
+    2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4
+    2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0
+    2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan 
+    2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan 
+    2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan
--- a/doc/code-docs/source/example/7B_demo.rst
+++ b/doc/code-docs/source/example/7B_demo.rst
@ -0,0 +1,192 @@
+7B Demo
+================
+
+训练配置
+----------------
+
+7B demo 的训练配置文件样例如下:
+
+.. code-block:: python
+
+    JOB_NAME = "7b_train"
+
+    SEQ_LEN = 2048
+    HIDDEN_SIZE = 4096
+    NUM_ATTENTION_HEAD = 32
+    MLP_RATIO = 8 / 3
+    NUM_LAYER = 32
+    VOCAB_SIZE = 103168
+
+    MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+    # Ckpt folder format:
+    # fs: 'local:/mnt/nfs/XXX'
+    SAVE_CKPT_FOLDER = "local:llm_ckpts"
+    LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+    # boto3 Ckpt folder format:
+    # import os
+    # BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+    # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+    # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+    CHECKPOINT_EVERY = 50
+    ckpt = dict(
+        enable_save_ckpt=False,  # enable ckpt save.
+        save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+        # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
+        # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
+        load_optimizer=True,  # Whether to load optimizer states when continuing training.
+        checkpoint_every=CHECKPOINT_EVERY,
+        async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
+        async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+        snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
+        oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+    )
+
+    TRAIN_FOLDER = "/path/to/dataset"
+    VALID_FOLDER = "/path/to/dataset"
+    data = dict(
+        seq_len=SEQ_LEN,
+        # micro_num means the number of micro_batch contained in one gradient update
+        micro_num=4,
+        # packed_length = micro_bsz * SEQ_LEN
+        micro_bsz=2,
+        # defaults to the value of micro_num
+        valid_micro_num=4,
+        # defaults to 0, means disable evaluate
+        valid_every=50,
+        pack_sample_into_one=False,
+        total_steps=50000,
+        skip_batches="",
+        rampup_batch_size="",
+        # Datasets with less than 50 rows will be discarded
+        min_length=50,
+        # train_folder=TRAIN_FOLDER,
+        # valid_folder=VALID_FOLDER,
+    )
+
+    grad_scaler = dict(
+        fp16=dict(
+            # the initial loss scale, defaults to 2**16
+            initial_scale=2**16,
+            # the minimum loss scale, defaults to None
+            min_scale=1,
+            # the number of steps to increase loss scale when no overflow occurs
+            growth_interval=1000,
+        ),
+        # the multiplication factor for increasing loss scale, defaults to 2
+        growth_factor=2,
+        # the multiplication factor for decreasing loss scale, defaults to 0.5
+        backoff_factor=0.5,
+        # the maximum loss scale, defaults to None
+        max_scale=2**24,
+        # the number of overflows before decreasing loss scale, defaults to 2
+        hysteresis=2,
+    )
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimizer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+    loss = dict(
+        label_smoothing=0,
+    )
+
+    adam = dict(
+        lr=1e-4,
+        adam_beta1=0.9,
+        adam_beta2=0.95,
+        adam_beta2_c=0,
+        adam_eps=1e-8,
+        weight_decay=0.01,
+    )
+
+    lr_scheduler = dict(
+        total_steps=data["total_steps"],
+        init_steps=0,  # optimizer_warmup_step
+        warmup_ratio=0.01,
+        eta_min=1e-5,
+        last_epoch=-1,
+    )
+
+    beta2_scheduler = dict(
+        init_beta2=adam["adam_beta2"],
+        c=adam["adam_beta2_c"],
+        cur_iter=-1,
+    )
+
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+    """
+    zero1 parallel:
+        1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    pipeline parallel (dict):
+        1. size: int, the size of pipeline parallel.
+        2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+    tensor parallel: tensor parallel size, usually the number of GPUs per node.
+    """
+    parallel = dict(
+        zero1=8,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+    cudnn_deterministic = False
+    cudnn_benchmark = False
+
+启动训练
+----------------
+
+完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动单节点 8GPU 的训练命令如下所示：
+
+.. code-block:: bash
+
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+
+训练结果
+----------------
+
+基于以上训练配置和启动命令，单节点 8GPU 下的模型训练部分日志展示如下：
+
+.. code-block:: bash
+
+    2023-09-05 11:47:44,649 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:SH-IDC1-10-140-1-110 device: 4
+    2023-09-05 11:47:44,650 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:SH-IDC1-10-140-1-110 device: 3
+    2023-09-05 11:47:44,651 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:SH-IDC1-10-140-1-110 device: 6
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:SH-IDC1-10-140-1-110 device: 7
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:SH-IDC1-10-140-1-110 device: 5
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:SH-IDC1-10-140-1-110 device: 1
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:SH-IDC1-10-140-1-110 device: 2
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0
+    2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1
+    2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0
+    2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan 
--- a/doc/code-docs/source/example/index.rst
+++ b/doc/code-docs/source/example/index.rst
@ -0,0 +1,18 @@
+训练样例
+================
+
+7B Demo
+------------
+
+.. toctree::
+   :maxdepth: 2
+
+   7B_demo
+
+30B Demo
+------------
+
+.. toctree::
+   :maxdepth: 2
+
+   30B_demo
--- a/doc/code-docs/source/index.rst
+++ b/doc/code-docs/source/index.rst
@ -0,0 +1,95 @@
+.. InternLM documentation master file, created by
+   sphinx-quickstart on Mon Aug 28 17:33:28 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+
+InternLM
+========
+
+环境构建
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   install
+
+快速上手
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   usage
+
+训练构建
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   initialize
+
+训练 API
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   training
+
+并行训练
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   parallel
+
+模型备份
+--------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   checkpoint
+
+性能分析
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   profiler
+
+训练监控
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   monitor
+
+训练样例
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   example/index
+
+常见问题
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   qa
+
+索引和表格
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
--- a/doc/code-docs/source/initialize.rst
+++ b/doc/code-docs/source/initialize.rst
@ -0,0 +1,88 @@
+训练构建
+==============
+
+.. _InternLM-args:
+
+命令行参数解析
+----------------
+
+InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_ 库来向InternLM运行时提供命令行参数配置。用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器，其中包含一些内置参数，用户可以向此解析器添加自定义参数。
+
+.. code-block:: python
+
+    # Get InternLM default parser
+    parser = internlm.initialize.get_default_parser()
+    # Add new argument
+    parser.add_argument("--user_arg", type=int, default=-1, help="argument added by user.")
+    cmd_args = parser.parse_args()
+
+.. autofunction:: internlm.initialize.get_default_parser
+
+
+.. _InternLM-model-init:
+
+模型初始化
+-------------------------
+
+.. autofunction:: internlm.train.initialize_model
+
+InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下：
+
+.. code-block:: python
+
+    model_type = "INTERNLM"  # default is "INTERNLM", used to register classes and modules for model initialization
+    NUM_ATTENTION_HEAD = 32
+    VOCAB_SIZE = 103168
+    HIDDEN_SIZE = 4096
+    NUM_LAYER = 32
+    MLP_RATIO = 8 / 3
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+
+- 字段 ``model_type`` 指明了要初始化的模型类型
+- 字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置
+
+值得注意的是，用户可以定义新的模型类型，并使用装饰器 ``@MODEL_INITIALIZER.register_module`` 注册模型的初始化函数，其中 ``MODEL_INITIALIZER`` 是类 ``internlm.utils.registry.Registry`` 的一个实例化对象，示例如下所示：
+
+.. code-block:: python
+
+    MODEL_TYPE = "NEW_MODEL"
+
+    @MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
+    def build_new_model_with_cfg(*args, **kwargs): ...
+
+.. _InternLM-optim-init:
+
+优化器初始化
+-------------------------
+
+.. autofunction:: internlm.train.initialize_optimizer
+
+.. _InternLM-dl-init:
+
+数据加载器初始化
+-------------------------
+
+.. autofunction:: internlm.train.get_train_data_loader
+
+.. _InternLM-trainer-init:
+
+Trainer 初始化
+-------------------------
+
+.. autofunction:: internlm.initialize.initialize_trainer
--- a/doc/code-docs/source/install.md
+++ b/doc/code-docs/source/install.md
@ -0,0 +1,2 @@
+```{include} ../../install.md
+```
--- a/doc/code-docs/source/monitor.rst
+++ b/doc/code-docs/source/monitor.rst
@ -0,0 +1,22 @@
+监控和告警
+=================
+
+监控
+-----------------
+
+InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` 来初始化上下文监控管理。其中，一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` 将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。
+
+.. autofunction:: internlm.monitor.monitor.initialize_monitor_manager
+
+.. autoclass:: internlm.monitor.monitor.MonitorManager
+    :members:
+
+.. autoclass:: internlm.monitor.monitor.MonitorTracker
+    :members:
+
+告警
+-----------------
+
+InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等，并捕获 SIGTERM 异常信号。当出现上述情况时，将触发警报，并通过调用 ``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook 地址发送报警消息。
+
+.. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook
--- a/doc/code-docs/source/parallel.rst
+++ b/doc/code-docs/source/parallel.rst
@ -0,0 +1,152 @@
+并行训练
+==================
+
+.. Brief introduction to training parallelism, and how-to guide about config setting
+
+InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 等并行化训练策略。在初始化分布式环境时，我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。
+
+InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定，用户可以通过修改配置文件 `config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ 来更改并行配置。以下是一个并行训练配置示例：
+
+.. code-block:: python
+
+    parallel = dict(
+        zero1=8,
+        tensor=1,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+- zero1：zero 并行策略，分如下三种情况，默认值为 -1
+
+    - 当 ``zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+    - 当 ``zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+    - 当 ``zero1 > 1`` 且 ``zero1 <= data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集
+
+- tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1
+- pipeline：流水线并行策略
+
+    - size：流水线并行大小，默认值为 1
+    - interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为 False
+
+- sequence_parallel：是否开启序列并行，默认值为 False
+
+注意：数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小
+
+张量并行
+-----------------
+
+InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_, 主要对 `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ 和
+`linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ 这两个模块进行张量并行操作。
+
+用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。
+
+.. figure:: ../../imgs/tensor_parallel.png
+  :scale: 50%
+  :class: with-border
+
+  张量并行，引自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_
+
+流水线并行
+-----------------
+
+InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ （1F1B，一次前向传递后跟一次反向传递）策略。对于 1F1B 策略，有两种实现方式：
+
+1. 非交错调度器，内存高效。
+2. 交错调度器，内存高效且时间高效（GPU空泡较少）。
+
+.. figure:: ../../imgs/pipeline_schedule.png
+  :scale: 45%
+  :class: with-border
+
+  1F1B 流水线并行调度器，引自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_
+
+非交错式流水线调度
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``。
+
+.. autoclass:: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
+    :members:
+
+交错式流水线调度
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``。
+
+.. autoclass:: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler
+    :members:
+
+值得注意的是，在使用交错式流水线调度器时可启用通信优化功能，即在 1F1B 阶段启用异步通信，以充分利用上行/下行带宽并实现通信与计算重叠。
+
+用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = True``。该功能启用后，将调用函数 ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``，并创建 ``internlm.core.communication.AsynCommunicator`` 以管理异步通信。
+
+``1F1B-without-overlap`` 和 ``1F1B-with-overlap`` 的区别如下所示：
+
+.. code-block:: bash
+
+    # The 1F1B stage without overlap consists of the following steps:
+    1. Perform the forward pass.
+    2. Perform the backward pass.
+    3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration to the previous stage, and receive the forward and backward inputs for the next iteration.
+
+.. code-block:: bash
+
+    # The 1F1B stage with overlap consists of the following steps:
+    1. Perform the forward pass.
+    2. Check if the backward input is ready.
+    3. Send the forward output and receive the forward input for the next iteration.
+    4. Perform the backward pass.
+    5. Check if the forward input is ready.
+    6. Send the backward output and receive the backward input for the next iteration.
+
+
+序列并行
+-----------------
+
+序列并行是一种在不引入额外计算、通信和内存开销的情况下，减少 ``layer_norm`` 和 ``dropout`` 操作中激活值内存占用的并行策略。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗，提高模型在资源受限环境中的可扩展性。
+
+如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``。
+
+.. figure:: ../../imgs/sequence_parallel.png
+  :scale: 50%
+  :class: with-border
+
+  序列并行，引自 flash-attention
+
+数据并行
+-----------------
+
+InternLM 支持数据并行。数据并行大小为:
+
+``Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size``
+
+ZeRO1.5
+-----------------
+
+ZeRO1.5 的实现使用了分层分片的概念，通过配置值 ``parallel.zero1`` 启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度，以减少内存使用并提高训练效率。
+
+1. 当 ``parallel.zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+2. 当 ``parallel.zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+3. 当 ``parallel.zero1 > 1`` 且 ``parallel.zero1 <= data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集
+
+此外，用户可以在配置文件中通过 ``hybrid_zero_optimizer`` 字段启用优化器的通信优化功能，设置桶大小，以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率，以及梯度的处理方式。
+
+.. code-block:: python
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimizer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+这里有两个值得关注的通信优化点：
+
+- overlap_sync_grad: 如果设置为 ``True``，则将训练的 ``backward pass`` 与梯度的 ``all-reduce`` 通信重叠
+- overlap_sync_param: 如果设置为 ``True``，则将参数的 ``broadcast`` 通信与下一步的 ``forward pass`` 进行重叠
+
+这些优化可以加速训练过程，提高训练效率。
+
+.. autoclass:: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer
+    :members:
--- a/doc/code-docs/source/profiler.rst
+++ b/doc/code-docs/source/profiler.rst
@ -0,0 +1,164 @@
+性能分析
+========
+
+.. Mainly about the usage of torch profiler and memory profiler
+
+Torch Profiler
+-----------------
+
+InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据，如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_ ，输出的性能分析 trace 文件可以使用 `tensorboard <https://www.tensorflow.org>`_ 进行可视化。
+
+用户如果想使用这个 torch 性能分析工具，需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch 性能分析后，用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` 文件夹中看到性能分析结果。
+
+实际运行生成的 ``Torch Profiler`` 目录结构如下：
+
+.. code-block:: bash
+
+    # tree ./7b_train/Sep08_11-00-51/traces -L 2
+    ./7b_train/Sep08_11-00-51/traces/
+    └── rank0_dp0_tp0_pp0
+        └── SH-IDC1-10-140-1-78_238619.1694142354680.pt.trace.json
+
+其中， ``traces`` 可以通过 ``TensorBoard`` 可视化，运行命令
+
+.. code-block:: bash
+
+    # visualize traces with tensorboard and custom port
+    tensorboard --logdir rank0_dp0_tp0_pp0 --port 10088
+
+在打开的 ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` 页面可以看到Operator和GPU Kernel的性能分析时间线如下，更多的功能请参考 `torch profiler with tensorboard <https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard>`_
+
+.. figure:: ../../imgs/torch_profiler_trace.png
+  :scale: 45%
+  :class: with-border
+
+.. autofunction:: internlm.train.initialize_llm_profile
+
+Memory Profiler
+-----------------
+
+InternLM 提供了一个实用的内存分析工具 ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` 来监控实际的 GPU 内存使用情况。在实现中，会对模型数据（包括模型参数、模型梯度和优化器状态）和非模型数据（包括激活值）分别进行详细的统计。
+
+要使用这个内存分析工具，用户需要在启动训练时传递 ``--profiling`` 参数以启用内存分析。完成内存分析后，用户可以在 ``memory_trace/rank{}_dp{}_tp{}`` 文件夹中找到特定 rank 对应的内存分析结果（包括不同时间点的内存使用日志和显示总体内存使用情况的旭日图）。
+
+实际运行生成的 ``memory_trace`` 目录结构如下：
+
+.. code-block:: bash
+
+    # tree ./memory_trace -L 2
+    ./memory_trace
+    ├── rank0_dp0_tp0                              # Profiling results for a specific rank device
+    │   ├── activation_memory_sunburst.html        # Sunburst chart showing activation memory usage
+    │   ├── grads_memory_sunburst.html             # Sunburst chart showing gradient memory usage
+    │   ├── memory.log                             # Log of GPU memory usage at different time points
+    │   ├── os_memory_sunburst.html                # Sunburst chart showing optimizer state memory usage
+    │   ├── params_memory_sunburst.html            # Sunburst chart showing parameter memory usage
+    │   └── summary_sunburst.html                  # Sunburst chart showing overall memory usage
+    ├── rank1_dp1_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank2_dp2_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank3_dp3_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank4_dp4_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank5_dp5_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank6_dp6_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    └── rank7_dp7_tp0
+        ├── activation_memory_sunburst.html
+        ├── grads_memory_sunburst.html
+        ├── memory.log
+        ├── os_memory_sunburst.html
+        ├── params_memory_sunburst.html
+        └── summary_sunburst.html
+
+其中， ``memory.log`` 的内容示例如下：
+
+.. code-block:: bash
+
+    Memory State:
+    time: 37.56313228607178
+    ---summary---
+    total_memory: 55953.56 MB
+    params_memory: 13965.51 MB, grads_memory: 13965.51 MB, os_params_memory: 3461.52 MB, os_state_memory: 6923.03 MB, activation_memory: 17638.00 MB
+
+    Memory State:
+    time: 38.46969723701477
+    ---summary---
+    total_memory: 38315.56 MB
+    params_memory: 13965.51 MB, grads_memory: 13965.51 MB, os_params_memory: 3461.52 MB, os_state_memory: 6923.03 MB, activation_memory: 0.00 MB
+    ---Layout---
+    params_layout:
+    layer: param_mem, layer_mem: 0.00 MB, total_mem: 13965.51 MB
+    layer: param_mem.embedding, layer_mem: 0.00 MB, total_mem: 806.00 MB
+    layer: param_mem.embedding.weight, layer_mem: 806.00 MB, total_mem: 806.00 MB
+    layer: param_mem.blocks, layer_mem: 0.00 MB, total_mem: 12353.50 MB
+    layer: param_mem.blocks.0, layer_mem: 0.00 MB, total_mem: 386.05 MB
+    layer: param_mem.blocks.0.mixer, layer_mem: 0.00 MB, total_mem: 128.03 MB
+    layer: param_mem.blocks.0.mixer.Wqkv, layer_mem: 0.00 MB, total_mem: 96.02 MB
+    layer: param_mem.blocks.0.mixer.Wqkv.weight, layer_mem: 96.00 MB, total_mem: 96.00 MB
+    layer: param_mem.blocks.0.mixer.Wqkv.bias, layer_mem: 0.02 MB, total_mem: 0.02 MB
+    layer: param_mem.blocks.0.mixer.out_proj, layer_mem: 0.00 MB, total_mem: 32.01 MB
+    layer: param_mem.blocks.0.mixer.out_proj.weight, layer_mem: 32.00 MB, total_mem: 32.00 MB
+    layer: param_mem.blocks.0.mixer.out_proj.bias, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm1, layer_mem: 0.00 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm1.weight, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm2, layer_mem: 0.00 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm2.weight, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.mlp, layer_mem: 0.00 MB, total_mem: 258.00 MB
+    layer: param_mem.blocks.0.mlp.w1, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w1.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w2, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w2.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w3, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w3.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    ......
+    grads_layout:
+    ......
+    os_params_layout:
+    ......
+    os_state_layout:
+    ......
+    activation_base_layout:
+    ......
+
+模型参数的旭日图示例如下：
+
+.. figure:: ../../imgs/params_memory_sunburst.png
+  :scale: 50%
+  :class: with-border
+
+.. autoclass:: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
+    :members:
--- a/doc/code-docs/source/qa.rst
+++ b/doc/code-docs/source/qa.rst
@ -0,0 +1,2 @@
+问&答
+=====
--- a/doc/code-docs/source/training.rst
+++ b/doc/code-docs/source/training.rst
@ -0,0 +1,9 @@
+训练 API
+============
+
+InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` 管理。在定义了训练引擎和调度器之后，我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。
+
+有关详细用法，请参阅 Trainer API 文档和示例。
+
+.. autoclass:: internlm.core.trainer.Trainer
+    :members:
--- a/doc/code-docs/source/usage.md
+++ b/doc/code-docs/source/usage.md
@ -0,0 +1,4 @@
+```{include} ../../usage.md
+:relative-docs: docs/
+:relative-images:
+```
--- a/doc/en/install.md
+++ b/doc/en/install.md
@ -1,4 +1,4 @@
-## InternLM Installation
+## Installation

 ### Environment Preparation
 The required packages and corresponding version are shown as follows:
@ -59,12 +59,28 @@ cd ../../
 ```

 ### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+``` 
+In docker.Makefile, you can customize the base image, environment versions, etc., and the corresponding parameters can be passed directly on the command line. BASE_OS supports ubuntu20.04 and centos7.
+
+#### Pull Standard Image
+Standard images based on Ubuntu and CentOS have already been built and can be pulled directly:

 ```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
 ```
+
+#### Run Container
+For a standard image that was either built locally with the dockerfile or pulled, use the following command to start and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory in the container is `/InternLM`. Please start training by following the [Usage](./usage.md) guide.
--- a/doc/en/structure.md
+++ b/doc/en/structure.md
@ -6,11 +6,14 @@ The system code file structure is shown below:
 ├── internlm                                 # Main directory of the system code
 │   ├── apis                                 # Interface module, containing some interface functions related to inference, etc.
 │   ├── core                                 # Core module, managing parallel context and training scheduling engine for training and inference
+│   │   ├── communication                    # Communication module, responsible for p2p communication in pipeline parallel scheduling
 │   │   ├── context                          # Context module, mainly responsible for initializing parallel process groups and managing parallel context
 │   │   │   ├── parallel_context.py
 │   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                        # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
 │   │   ├── engine.py                        # Responsible for managing the training and evaluation process of the model
-│   │   ├── no_pipeline_scheduler.py         # Scheduler for parallel training
 │   │   └── trainer.py                       # Responsible for managing the training engine and scheduler
 │   ├── data                                 # Data module, responsible for managing dataset generation and processing
 │   ├── initialize                           # Initialization module, responsible for managing distributed environment startup and trainer initialization
--- a/doc/en/usage.md
+++ b/doc/en/usage.md
@ -1,4 +1,4 @@
-## Pre-training and Fine-tuning Tutorial for InternLM
+## Quickstart Guide for Pre-training and Fine-tuning

 To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration.

@ -93,10 +93,7 @@ data = dict(
 )
 ```

-<div align="left">
-    <img src="../imgs/pack_into_one.png" width="550"/>
-</div>
-
+![pack_into_one](../imgs/pack_into_one.png)

 Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows:

@ -115,19 +112,19 @@ If you want to load a model checkpoint when starting the training, you can confi

 ```python
 SAVE_CKPT_FOLDER = "local:/path/to/save/ckpt"
-MODEL_ONLY_FOLDER = "local:/path/to/load/init/model/ckpt"
 LOAD_CKPT_FOLDER = "local:/path/to/load/resume/ckpt"
 ckpt = dict(
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save the model and optimizer checkpoints
    checkpoint_every=float("inf"),  # Save a checkpoint every specified number of steps, default value is inf
-    load_model_only_folder=MODEL_ONLY_FOLDER,  # Path to load the initial model weights, only load model weights without loading optimizer weights, training will start from the first step
-    load_ckpt_folder=LOAD_CKPT_FOLDER,  # Path to load the weights of the model and optimizer for resuming training, training will resume from the specified step
-    load_optimizer=True,  # Whether to load optimizer weights when resuming training, default value is True
+    # When resuming training from a checkpoint:
+    # (1) 'path' is the path of the checkpoint to load.
+    # (2) 'content' indicates which states will be loaded; supported values: "model", "sampler", "optimizer", "scheduler", "all"
+    # (3) 'ckpt_type' indicates the type of checkpoint to load; currently supported: "internlm"
+    load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internlm"),
 )
 ```

 Note:
- `load_model_only_folder` and `load_ckpt_folder` cannot be set at the same time.
 - If the path starts with `local:`, it means the file is stored in the local file system. If it starts with `boto3:`, it means the file is stored in the remote OSS.

 The configuration for the model is as follows:
@ -165,17 +162,21 @@ Training parallel configuration example:
 ```python
 parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
 )
 ```

 - zero1: zero parallel strategy, divided into the following three cases, default value is -1
-  - When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
-  - When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
-  - When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+  - When `zero1 <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
+  - When `zero1 == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
+  - When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
+- tensor: tensor parallel size, usually the number of GPUs per node, default is 1
+- pipeline: pipeline parallel strategy
+  - size: pipeline parallel size, the default value is 1
+  - interleaved_overlap: bool type; enables or disables communication optimization when interleaved scheduling is used, the default value is False
+- sequence_parallel: Whether to enable sequence parallelism, the default value is False

 Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`

--- a/doc/imgs/params_memory_sunburst.png
+++ b/doc/imgs/params_memory_sunburst.png
--- a/doc/imgs/pipeline_schedule.png
+++ b/doc/imgs/pipeline_schedule.png
--- a/doc/imgs/sequence_parallel.png
+++ b/doc/imgs/sequence_parallel.png
--- a/doc/imgs/tensor_parallel.png
+++ b/doc/imgs/tensor_parallel.png
--- a/doc/imgs/torch_profiler_trace.png
+++ b/doc/imgs/torch_profiler_trace.png
--- a/doc/install.md
+++ b/doc/install.md
@ -1,4 +1,4 @@
-## InternLM项目的依赖安装
+## 环境安装

 ### 环境准备
 首先，需要安装的依赖包及对应版本列表如下：
@ -59,11 +59,28 @@ cd ../../
 ```

 ### 环境镜像
-用户可以从 https://hub.docker.com/r/sunpengsdu/internlm 获取安装了 InternLM 运行环境的镜像，拉取镜像及启动容器的命令如下：
-```bash
-# 拉取镜像
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# 启动容器
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+用户可以使用提供的 dockerfile 结合 docker.Makefile 来构建自己的镜像，或者也可以从 https://hub.docker.com/r/internlm/internlm 获取安装了 InternLM 运行环境的镜像。
+
+#### 镜像配置及构造
+dockerfile 的配置以及构造均通过 docker.Makefile 文件实现，在 InternLM 根目录下执行如下命令即可 build 镜像：
+``` bash
+make -f docker.Makefile BASE_OS=centos7
 ``` 
+在 docker.Makefile 中可自定义基础镜像，环境版本等内容，对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 ubuntu20.04 和 centos7。
+
+#### 镜像拉取
+基于 ubuntu 和 centos 的标准镜像已经 build 完成，也可直接拉取使用：
+
+```bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
+```
+
+#### 容器启动
+对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器：
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+容器内默认目录即 `/InternLM`，根据[使用文档](./usage.md)即可启动训练。
--- a/doc/structure.md
+++ b/doc/structure.md
@ -6,11 +6,14 @@
 ├── internlm                                 # 系统代码的主目录
 │   ├── apis                                 # 接口模块，包含一些关于推理等的接口函数
 │   ├── core                                 # 核心模块，管理用于训练和推理的 parallel context 和训练调度引擎
+│   │   ├── communication                    # 通信模块，负责流水线并行调度中的p2p通信
 │   │   ├── context                          # context 模块，主要负责初始化并行进程组，并管理 parallel context
 │   │   │   ├── parallel_context.py
 │   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                        # 调度模块，管理并行训练的调度器，包括非流水线并行调度器和流水线并行调度器
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
 │   │   ├── engine.py                        # 负责管理模型的训练和评估过程
-│   │   ├── no_pipeline_scheduler.py         # 并行训练的调度器
 │   │   └── trainer.py                       # 负责管理训练引擎和调度器
 │   ├── data                                 # 数据模块，负责管理数据集生成和处理
 │   ├── initialize                           # 初始化模块，负责管理分布式环境启动和训练器初始化
--- a/doc/usage.md
+++ b/doc/usage.md
@ -1,4 +1,4 @@
-## 基于InternLM的预训练与微调使用教程
+## 使用教程

 启动一个 Demo 模型训练，需要进行三项准备，**安装**，**数据集准备**和**模型训练配置**。接下来，首先会介绍数据准备相关的操作，再简要描述模型训练配置相关的内容。

@ -84,9 +84,7 @@ data = dict(
 )
 ```

-<div align="left">
-    <img src="./imgs/pack_into_one.png" width="550"/>
-</div>
+![pack_into_one](./imgs/pack_into_one.png)


 目前支持传入数据集文件路径`train_folder`，且要求文件格式如下：
@ -103,18 +101,17 @@ data = dict(
 如果在启动训练时要加载模型 `checkpoint`，可进行如下相关配置：
 ```python
 SAVE_CKPT_FOLDER = "local:/path/to/save/ckpt"
-MODEL_ONLY_FOLDER = "local:/path/to/load/init/model/ckpt"
 LOAD_CKPT_FOLDER = "local:/path/to/load/resume/ckpt"
 ckpt = dict(
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # 存储模型和优化器 checkpoint 的路径
    checkpoint_every=float("inf"),  # 每多少个 step 存储一次 checkpoint，默认值为 inf
-    load_model_only_folder=MODEL_ONLY_FOLDER,  # 加载模型初始权重的路径，只加载模型权重，不加载优化器权重，训练将从第一个 step 开始
-    load_ckpt_folder=LOAD_CKPT_FOLDER,  # 断点续训时，加载模型和优化器等权重的路径，将从指定的 step 恢复训练
-    load_optimizer=True,  # 断点续训时，是否需要加载优化器权重，默认值为 True
+    # 断点续训时，加载模型和优化器等权重的路径，将从指定的 step 恢复训练
+    # content 表示哪些状态会被加载，支持： "model", "sampler", "optimizer", "scheduler", "all"
+    # ckpt_type 表示加载的模型类型，目前支持: "internlm"
+    load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internlm"),
 )
 ```
 注意：
- `load_model_only_folder`与`load_ckpt_folder`不能同时设置
 - 路径若以 `local:` 为前缀，则存储在本地文件系统；若以 `boto3:` 为前缀，则存储在远程 oss 上

 模型相关关键参数配置如下所示：
@ -151,16 +148,20 @@ model = dict(
 ```python
 parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
 )
 ```
 - zero1：zero 并行策略，分如下三种情况，默认值为 -1
-  - 当`size <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
-  - 当`size == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
-  - 当`size > 1`且`size <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集
- pipeline：流水线并行大小，默认值为 1
+  - 当`zero1 <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+  - 当`zero1 == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+  - 当`zero1 > 1`且`zero1 <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集
 - tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1
+- pipeline：流水线并行策略
+  - size：流水线并行大小，默认值为 1
+  - interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为关闭
+- sequence_parallel：是否开启序列并行，默认值为 False

 注意：`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`

--- a/docker.Makefile
+++ b/docker.Makefile
@ -0,0 +1,107 @@
+DOCKER_REGISTRY          ?= docker.io
+DOCKER_ORG               ?= my
+DOCKER_IMAGE             ?= internlm
+DOCKER_FULL_NAME          = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)
+
+CUDA_VERSION              = 11.7.1
+GCC_VERSION               = 10.2.0
+
+CUDNN_VERSION             = 8
+BASE_RUNTIME              =
+# ubuntu20.04  centos7
+BASE_OS                   = centos7
+BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
+# The conda channel to use to install cudatoolkit
+CUDA_CHANNEL              = nvidia
+# The conda channel to use to install pytorch / torchvision
+INSTALL_CHANNEL          ?= pytorch
+
+PYTHON_VERSION           ?= 3.10
+PYTORCH_VERSION          ?= 1.13.1
+TORCHVISION_VERSION      ?= 0.14.1
+TORCHAUDIO_VERSION       ?= 0.13.1
+BUILD_PROGRESS           ?= auto
+TRITON_VERSION           ?=
+GMP_VERSION              ?= 6.2.1
+MPFR_VERSION             ?= 4.1.0
+MPC_VERSION              ?= 1.2.1
+GCC_VERSION              ?= 10.2.0
+HTTPS_PROXY_I            ?=
+HTTP_PROXY_I             ?=
+FLASH_ATTEN_VERSION      ?= 1.0.5
+FLASH_ATTEN_TAG          ?= v${FLASH_ATTEN_VERSION}
+
+BUILD_ARGS                = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
+                            --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
+                            --build-arg CUDA_VERSION=$(CUDA_VERSION) \
+                            --build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
+                            --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
+                            --build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION) \
+                            --build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION) \
+                            --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
+                            --build-arg TRITON_VERSION=$(TRITON_VERSION) \
+                            --build-arg GMP_VERSION=$(GMP_VERSION) \
+                            --build-arg MPFR_VERSION=$(MPFR_VERSION) \
+                            --build-arg MPC_VERSION=$(MPC_VERSION) \
+                            --build-arg GCC_VERSION=$(GCC_VERSION) \
+                            --build-arg https_proxy=$(HTTPS_PROXY_I) \
+                            --build-arg http_proxy=$(HTTP_PROXY_I) \
+                            --build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)
+
+EXTRA_DOCKER_BUILD_FLAGS ?=
+
+BUILD                    ?= build
+# Intentionally left blank
+PLATFORMS_FLAG           ?=
+PUSH_FLAG                ?=
+USE_BUILDX               ?=1
+BUILD_PLATFORMS          ?=
+WITH_PUSH                ?= false
+BUILD_TYPE               ?= intrenlm-dev
+
+# Setup buildx flags
+ifneq ("$(USE_BUILDX)","")
+BUILD                     =  buildx build
+ifneq ("$(BUILD_PLATFORMS)","")
+PLATFORMS_FLAG            = --platform="$(BUILD_PLATFORMS)"
+endif
+endif
+# endif
+
+# # Only set platforms flags if using buildx
+# ifeq ("$(WITH_PUSH)","true")
+# PUSH_FLAG               = --push
+# endif
+# endif
+
+ifeq ($(findstring centos,$(BASE_OS)),centos)
+    DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
+else
+    DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
+endif
+
+#use -f to specify dockerfile
+DOCKER_BUILD              = DOCKER_BUILDKIT=1 \
+                            docker $(BUILD) \
+                                   --progress=$(BUILD_PROGRESS) \
+                                   $(EXTRA_DOCKER_BUILD_FLAGS) \
+                                   $(PLATFORMS_FLAG) \
+                                   $(PUSH_FLAG) \
+                                   -f $(DOCKERFILE_PATH) \
+                                   -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
+                                   $(BUILD_ARGS) .
+
+                                   # --target $(BUILD_TYPE)
+
+.PHONY: all
+all: devel-image
+
+.PHONY: devel-image
+devel-image: BASE_IMAGE := $(BASE_DEVEL)
+devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
+devel-image:
+	$(DOCKER_BUILD)
+
+.PHONY: clean
+clean:
+	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))
--- a/docker/Dockerfile-centos
+++ b/docker/Dockerfile-centos
@ -0,0 +1,131 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on centos
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN yum install deltarpm -y && yum update -y \
+    && yum install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        bzip2 \
+        gcc \
+        gcc-c++ \
+        file \
+        texinfo \
+        which
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh -O  "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && git clone https://github.com/ninja-build/ninja.git \
+    && cd ninja \
+    && git checkout release \
+    && ./configure.py --bootstrap \
+    && mv ./ninja /usr/bin \
+    && cd ..
+
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
+
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+RUN git submodule update --init --recursive \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
+    && cd /InternLM/third_party/flash-attention \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/docker/Dockerfile-ubuntu
+++ b/docker/Dockerfile-ubuntu
@ -0,0 +1,112 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on ubuntu
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        ninja-build
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh -O  "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxJf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+RUN git submodule update --init --recursive \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
+    && cd /InternLM/third_party/flash-attention \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/Dockerfile-centos
+++ b/experiment/Dockerfile-centos
@ -0,0 +1,161 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on centos
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN yum install deltarpm -y && yum update -y \
+    && yum install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        bzip2 \
+        gcc \
+        gcc-c++ \
+        file \
+        texinfo \
+        which
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh -O  "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && git clone https://github.com/ninja-build/ninja.git \
+    && cd ninja \
+    && git checkout release \
+    && ./configure.py --bootstrap \
+    && mv ./ninja /usr/bin \
+    && cd ..
+
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
+
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG PYTORCH_VERSION
+ARG TORCHVISION_VERSION
+ARG TORCHAUDIO_VERSION
+
+RUN /opt/conda/bin/pip --no-cache-dir install \
+    transformers==4.29.2 \
+    sentencepiece \
+    numpy \
+    tqdm \
+    psutil \
+    packaging \
+    pre-commit \
+    ninja \
+    gputil \
+    pytest \
+    packaging \
+    boto3 \
+    botocore \
+    torch-scatter \
+    pyecharts \
+    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
+    && /opt/conda/bin/pip --no-cache-dir install \
+    --extra-index-url https://download.pytorch.org/whl/cu117 \
+    torch==${PYTORCH_VERSION}+cu117 \
+    torchvision==${TORCHVISION_VERSION}+cu117 \
+    torchaudio==${TORCHAUDIO_VERSION}
+
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+ARG FLASH_ATTEN_TAG
+
+RUN git submodule update --init --recursive \
+    && cd /InternLM/third_party/flash-attention \
+    && git checkout ${FLASH_ATTEN_TAG} \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/Dockerfile-ubuntu
+++ b/experiment/Dockerfile-ubuntu
@ -0,0 +1,142 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on ubuntu
+##############################################################################
+FROM ${BASE_IMAGE} AS base
+ARG https_proxy
+ARG http_proxy
+# Toolchain and download utilities used by the later stages. The apt package lists
+# are removed in the same layer so they do not end up in the image.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base AS conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+# Download the Miniconda installer matching the target platform
+# (aarch64 for linux/arm64, x86_64 for everything else).
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+# Install conda into /opt/conda, pin the requested Python version and clean the package cache.
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda AS dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+# Build GMP, MPFR and MPC from source, then GCC against them. Each is installed
+# under its own versioned prefix in /usr/local. The make parallelism follows the
+# number of CPUs available on the build host instead of a fixed job count.
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxJf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j"$(nproc)" && make install
+
+# Make the freshly built GCC the default compiler for the following stage.
+# NOTE(review): CUDA_PATH is not defined in this file; presumably it is provided by BASE_IMAGE - confirm.
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep AS intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+# Proxy settings and version pins are supplied at build time via --build-arg.
+# An ARG stays in scope until the end of the stage, so each one is declared once.
+ARG https_proxy
+ARG http_proxy
+ARG PYTORCH_VERSION
+ARG TORCHVISION_VERSION
+ARG TORCHAUDIO_VERSION
+
+# Install the Python dependencies, then the cu117 builds of torch/torchvision/torchaudio.
+RUN /opt/conda/bin/pip --no-cache-dir install \
+    transformers==4.29.2 \
+    sentencepiece \
+    numpy \
+    tqdm \
+    psutil \
+    packaging \
+    pre-commit \
+    ninja \
+    gputil \
+    pytest \
+    boto3 \
+    botocore \
+    torch-scatter \
+    pyecharts \
+    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
+    && /opt/conda/bin/pip --no-cache-dir install \
+    --extra-index-url https://download.pytorch.org/whl/cu117 \
+    torch==${PYTORCH_VERSION}+cu117 \
+    torchvision==${TORCHVISION_VERSION}+cu117 \
+    torchaudio==${TORCHAUDIO_VERSION}
+
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+ARG FLASH_ATTEN_TAG
+
+# Build flash-attention at the requested tag together with its csrc extensions,
+# then build apex with its C++/CUDA extensions, and finally drop the pip cache.
+RUN git submodule update --init --recursive \
+    && cd /InternLM/third_party/flash-attention \
+    && git checkout ${FLASH_ATTEN_TAG} \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd /InternLM/third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/README-CN.md
+++ b/experiment/README-CN.md
@ -0,0 +1,25 @@
+## 实验性环境镜像
+本模块用于测试新版本环境，默认测试新环境 torch=2.0.1，flash-attention=2.1.0。新环境可能具有不稳定性，标准环境安装请参考：[安装文档](../doc/install.md)
+
+### 镜像构建及拉取
+构建镜像时请于 InternLM 根目录下执行 docker.Makefile，该文件与标准环境镜像共用，所使用的 Dockerfile 位于 experiment 目录下。也可直接从 https://hub.docker.com/r/internlm/internlm 拉取镜像，命令如下：
+```bash
+# 构建镜像
+# ubuntu20.04
+make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+# centos7
+make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+
+# 拉取镜像
+# ubuntu20.04
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
+# centos7
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
+```
+
+### 容器启动
+对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器：
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
+```
+容器内默认目录即 `/InternLM`，根据[使用文档](../doc/usage.md)即可启动训练。
--- a/experiment/README-EN.md
+++ b/experiment/README-EN.md
@ -0,0 +1,25 @@
+## Experimental Environment Image
+This module is used to test new versions of the environment; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/en/install.md).
+
+### Build and Pull Image
+To build the image, run docker.Makefile from the InternLM root directory. This Makefile is shared with the standard environment image, while the Dockerfiles used here are located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm. The commands are as follows:
+```bash
+# Build Image
+# ubuntu20.04
+make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+# centos7
+make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+
+# Pull Image
+# ubuntu20.04
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
+# centos7
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
+```
+
+### Run Container
+For a local standard image, either built from the Dockerfile or pulled, use the following command to start and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
+```
+The default directory in the container is `/InternLM`. Please start training according to the [Usage](../doc/en/usage.md) guide.
--- a/internlm/core/context/init.py
+++ b/internlm/core/context/init.py
@ -7,6 +7,7 @@ from .parallel_context import (
 from .process_group_initializer import (
    Initializer_Data,
    Initializer_Model,
+    Initializer_Nettest,
    Initializer_Pipeline,
    Initializer_Tensor,
    Initializer_Zero1,
@ -34,6 +35,7 @@ __all__ = [
    "Initializer_Pipeline",
    "Initializer_Data",
    "Initializer_Zero1",
+    "Initializer_Nettest",
    "ProcessGroupInitializer",
    "Initializer_Model",
    "seed",
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@ -18,6 +18,7 @@ import torch.distributed as dist

 from internlm.utils.common import SingletonMeta
 from internlm.utils.logger import get_logger
+from internlm.utils.timeout import LLM_NCCL_TIMEOUT

 from . import process_group_initializer as pgroup_initializer
 from .process_group_initializer import ParallelMode
@ -100,7 +101,7 @@ class Config(dict):

        module_name = filepath.stem
        source_file = SourceFileLoader(fullname=str(module_name), path=str(filepath))
-        module = source_file.load_module()  # pylint: disable=W4902,E1120, W1505:
+        module = source_file.load_module()  # pylint: disable=W4902,E1120,W1505

        # load into config
        config = Config()
@ -143,6 +144,7 @@ class ParallelContext(metaclass=SingletonMeta):
        self.pipeline_parallel_size = 1
        self.tensor_parallel_size = 1
        self.zero1_parallel_size = -1
+        self.nettest_parallel_size = 1
        self.expert_parallel_size = -1
        self.num_processes_on_current_node = -1
        self.virtual_pipeline_parallel_size = None
@ -374,12 +376,22 @@ class ParallelContext(metaclass=SingletonMeta):
        """
        # initialize the default process group
        init_method = f"tcp://[{host}]:{port}"
-        dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)
+        dist.init_process_group(
+            rank=rank,
+            world_size=world_size,
+            backend=backend,
+            init_method=init_method,
+            timeout=LLM_NCCL_TIMEOUT,
+        )

        # None will give the default global process group for pytorch dist operations
        ranks = list(range(world_size))
        if use_cpu:
-            cpu_group = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else None
+            cpu_group = (
+                dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                if dist.get_backend() != "gloo"
+                else None
+            )
        else:
            cpu_group = None
        self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
@ -443,6 +455,9 @@ class ParallelContext(metaclass=SingletonMeta):
        # instead, it should be calculated based on other parallel config
        self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)

+        # the recommended nettest_parallel_size is 32 GPUs
+        self.nettest_parallel_size = 32
+
        # TODO : data parallel size can be different with expert parallel size
        self.expert_parallel_size = self.data_parallel_size

@ -458,6 +473,7 @@ class ParallelContext(metaclass=SingletonMeta):
            self.pipeline_parallel_size,
            self.tensor_parallel_size,
            self.zero1_parallel_size,
+            self.nettest_parallel_size,
            self.expert_parallel_size,
        ]

@ -467,6 +483,7 @@ class ParallelContext(metaclass=SingletonMeta):
        initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
        initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
        initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
+        initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
        if self.pipeline_parallel_size > 1:
            initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
        if self.config.model.num_experts > 1:
--- a/internlm/core/context/process_group_initializer.py
+++ b/internlm/core/context/process_group_initializer.py
@ -3,11 +3,14 @@

 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context

+import math
 from abc import ABC, abstractmethod
 from enum import Enum

 import torch.distributed as dist

+from internlm.utils.timeout import LLM_NCCL_TIMEOUT
+

 # parallel modes
 class ParallelMode(Enum):
@ -31,15 +34,18 @@ class ParallelMode(Enum):
    # zero1 parallel
    ZERO1 = "zero1"

+    # runtime network test
+    NETTEST = "nettest"
+
+    # dummy mode, only used during mode construction
+    DUMMY = "dummy"
+
    # expert parallel
    EXPERT = "expert"

    # expert data parallel
    EXPERT_DATA = "expert_data"

-    # dummy mode, only used during mode construction
-    DUMMY = "dummy"
-

 class ProcessGroupInitializer(ABC):
    """An object, knowing the parallelism configuration, that initializes parallel groups.
@ -62,6 +68,7 @@ class ProcessGroupInitializer(ABC):
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        zero1_parallel_size: int,
+        nettest_parallel_size: int,
        expert_parallel_size: int,
    ):
        self.rank = rank
@ -70,6 +77,7 @@ class ProcessGroupInitializer(ABC):
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.zero1_parallel_size = zero1_parallel_size
+        self.nettest_parallel_size = nettest_parallel_size
        self.expert_parallel_size = expert_parallel_size
        super().__init__()

@ -113,9 +121,13 @@ class Initializer_Data(ProcessGroupInitializer):

        for i in range(self.rank_num_per_dp_group):
            ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -166,9 +178,13 @@ class Initializer_Model(ProcessGroupInitializer):

        for i in range(self.num_group):
            ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -227,9 +243,13 @@ class Initializer_Pipeline(ProcessGroupInitializer):
                    )
                )
                pipe_group_size = len(ranks)
-                pipe_group = dist.new_group(ranks)
+                pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
                if use_cpu:
-                    group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else pipe_group
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else pipe_group
+                    )
                else:
                    group_cpu = None

@ -278,9 +298,13 @@ class Initializer_Tensor(ProcessGroupInitializer):

        for i in range(self.num_tensor_parallel_group):
            ranks = [i * self.tensor_parallel_size + j for j in range(self.tensor_parallel_size)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -335,9 +359,13 @@ class Initializer_Zero1(ProcessGroupInitializer):
                    i + (j * self.zero1_parallel_size + k) * self.rank_num_per_dp_group
                    for k in range(self.zero1_parallel_size)
                ]
-                group = dist.new_group(ranks)
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
                if use_cpu:
-                    group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
                else:
                    group_cpu = None

@ -351,6 +379,59 @@ class Initializer_Zero1(ProcessGroupInitializer):
        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode


+class Initializer_Nettest(ProcessGroupInitializer):
+    """A ProcessGroupInitializer that builds the groups used for network tests, especially for NCCL.
+
+    The world is cut into consecutive chunks of ``nettest_parallel_size`` ranks. The last
+    chunk is smaller when ``world_size`` is not a multiple of ``nettest_parallel_size``.
+
+    Args:
+        rank (int): The rank of current process.
+        world_size (int): Size of whole communication world.
+        nettest_parallel_size (int): Size of a network test group.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # round up so that a trailing, partially filled chunk still gets its own group
+        self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
+
+    def init_dist_group(self, use_cpu: bool = False):
+        """Initialize network test groups, and assign local_ranks and groups to each gpu.
+
+        Args:
+            use_cpu (bool): whether to also provide a cpu group for every set of ranks;
+                a separate "gloo" group is created unless the default backend is already "gloo".
+
+        Returns:
+            Tuple (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode):
+                The network test group information of the current rank.
+        """
+        mode = ParallelMode.NETTEST
+        local_rank = group_world_size = process_group = cpu_group = ranks_in_group = None
+
+        chunk_size = self.nettest_parallel_size
+        for group_idx in range(self.num_nettest_group):
+            first_rank = group_idx * chunk_size
+            # clip the chunk at world_size so that the last group only holds existing ranks
+            ranks = list(range(first_rank, min(first_rank + chunk_size, self.world_size)))
+
+            # groups are created for every chunk, not only for the one this rank belongs to
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+            if not use_cpu:
+                group_cpu = None
+            elif dist.get_backend() != "gloo":
+                group_cpu = dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+            else:
+                group_cpu = group
+
+            if self.rank not in ranks:
+                continue
+
+            # remember the group that contains the current rank
+            local_rank = ranks.index(self.rank)
+            group_world_size = len(ranks)
+            process_group = group
+            cpu_group = group_cpu
+            ranks_in_group = ranks
+
+        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
+
+
 class Initializer_Expert(ProcessGroupInitializer):
    """A ProcessGroupInitializer for expert parallelism.

--- a/internlm/core/scheduler/no_pipeline_scheduler.py
+++ b/internlm/core/scheduler/no_pipeline_scheduler.py
@ -10,6 +10,7 @@ import torch
 from internlm.core.context import global_context as gpc
 from internlm.core.engine import Engine
 from internlm.utils.common import conditional_context
+from internlm.utils.timeout import llm_timeout

 from .base_scheduler import BaseScheduler, SchedulerHook

@ -26,13 +27,13 @@ class NonPipelineScheduler(BaseScheduler):
        gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable
            gradient accumulation.

-    Example:
-        # this shows an example of customized data_process_func
-        def data_process_func(dataloader_output):
-            item1, item2, item3 = dataloader_output
-            data = (item1, item2)
-            label = item3
-            return data, label
+    Examples:
+        >>> # this shows an example of customized data_process_func
+        >>> def data_process_func(dataloader_output):
+        >>>     item1, item2, item3 = dataloader_output
+        >>>     data = (item1, item2)
+        >>>     label = item3
+        >>>     return data, label
    """

    def __init__(
@ -131,6 +132,7 @@ class NonPipelineScheduler(BaseScheduler):

        return output, loss, moe_loss

+    @llm_timeout(func_name="nopp_forward_backward_step")
    def forward_backward_step(
        self,
        engine: Engine,
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@ -16,6 +16,7 @@ from internlm.core.engine import Engine
 from internlm.core.naive_amp import NaiveAMPModel
 from internlm.utils.common import get_current_device, move_to_device
 from internlm.utils.logger import get_logger
+from internlm.utils.timeout import llm_timeout

 from .base_scheduler import BaseScheduler, SchedulerHook

@ -635,6 +636,7 @@ class PipelineScheduler(BaseScheduler):

        return output, label, accum_loss, accum_moe_loss

+    @llm_timeout(func_name="nointerleaved_forward_backward_step")
    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
        Returns a tuple with losses if the last stage, an empty tuple otherwise.
@ -1127,8 +1129,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        1. Perform the forward pass.
        2. Perform the backward pass.
        3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
-           to the previous stage,
-        and receive the forward and backward inputs for the next iteration.
+           to the previous stage, and receive the forward and backward inputs for the next iteration.

        Args:
            engine (Engine): The engine to use for computation.
@ -1304,6 +1305,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        # 3. Cooldown
        self._run_cooldown_loop(engine, num_microsteps, num_1f1b_micropairs=num_1f1b_micropairs)

+    @llm_timeout(func_name="interleaved_forward_backward_step")
    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        """Run interleaved 1F1B schedule (model split into model chunks), with
        communication between pipeline stages as needed.
--- a/internlm/core/trainer.py
+++ b/internlm/core/trainer.py
@ -23,7 +23,15 @@ class TrainState:
        train_dl (DataLoader): The DataLoader object used for training.
    """

-    def __init__(self, config) -> None:
+    def __init__(self, config, batch_sampler) -> None:
+        """
+        Args:
+            config (Config): internlm config
+            batch_sampler (torch.utils.data.Sampler): Because the dataloader loads data
+            asynchronously and prefetches it, the batch_sampler state maintained inside the
+            dataloader runs ahead of the actual training progress, so we keep a copy of the
+            batch_sampler as the anchor point for ckpt reload.
+        """
        # The number of batches produced by the data iterator
        self.batch_count: int = 0
        # Used to store the number of samples consumed in the current epoch
@ -43,9 +51,20 @@ class TrainState:

        self.tensorboard_folder = config.tensorboard_folder

-    def init_batch_sampler(self, train_dl):
-        # Copy of the batch sampler from the DataLoader
-        self.batch_sampler = train_dl.batch_sampler.copy()
+        # learning rate
+        self.lr = config.adam.lr
+
+        # sampler state
+        if batch_sampler:
+            self.init_batch_sampler(batch_sampler)
+
+    def init_batch_sampler(self, batch_sampler):
+        """
+        Args:
+            batch_sampler (torch.utils.data.Sampler): sampler.
+        """
+        # make a copy of batch_sampler.
+        self.batch_sampler = batch_sampler.copy()
        # Iterator for the batch sampler
        self.batch_sampler_iter = iter(self.batch_sampler)

@ -61,25 +80,22 @@ class TrainState:

        return json.dumps(info, indent=4, sort_keys=True)

-    def load_state_dict(self, other_stuffs, train_dl):
+    def load_state_dict(self, other_stuffs):
        """
        Resumes training from a checkpoint.

        Args:
            other_stuffs (dict): Other information needed to resume training.
-            train_dl (DataLoader): The DataLoader object used for training.
        """
-
-        self.batch_count = other_stuffs["batch_count"] + 1  # here you need to shift a batch backward
        self.num_consumed_samples_in_epoch = other_stuffs["num_consumed_samples_in_epoch"]
        self.num_consumed_tokens = other_stuffs["num_consumed_tokens"]
        self.inf_nan_skip_batches = other_stuffs["inf_nan_skip_batches"]
-        # compatible with previous checkpoints without this parameter
-        self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1

-        # track the actual updates of sampler when using weighted sampling
-        self.batch_sampler = train_dl.batch_sampler.copy()
-        self.batch_sampler_iter = iter(self.batch_sampler)
+        # Because the ckpt is saved after 'step_count' has been updated,
+        # there is no need to increment 'step_count' here (does our step count start from 0?).
+        # However, 'batch_count' is updated before the ckpt is stored, so it needs to be incremented by 1 on resume.
+        self.batch_count = other_stuffs["batch_count"] + 1  # here you need to shift a batch backward
+        self.step_count = other_stuffs.get("step_count", self.batch_count)

        # resume tensorboard from older tensorboard_folder
        self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
@ -130,10 +146,12 @@ class Trainer:

    @property
    def engine(self):
+        """Returns the engine that is responsible for managing the training and evaluation process."""
        return self._engine

    @property
    def schedule(self):
+        """Returns the runtime scheduler."""
        return self._schedule

    @property
@ -142,15 +160,19 @@ class Trainer:
        return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))

    def train(self):
+        """Sets the model to training mode."""
        self._engine.train()

    def eval(self):
+        """Sets the model to evaluation mode."""
        self._engine.eval()

    def zero_grad(self):
+        """Sets the gradient of all parameters in the model to zero."""
        self._engine.zero_grad()

    def step(self):
+        """Executes the parameter update step."""
        return self._engine.step()

    def execute_schedule(self, data_iter: Iterable, **kwargs):
--- a/internlm/initialize/initialize_tensor.py
+++ b/internlm/initialize/initialize_tensor.py
@ -3,16 +3,15 @@

 import math

-import torch
 from torch import Tensor, nn


-def scaled_init_method_normal(sigma, num_layers):
+def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
    """Init method based on N(0, sigma/sqrt(2*num_layers)."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+        return nn.init.normal_(tensor, mean=0.0, std=std)

    return init_

@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
        return nn.init.normal_(tensor, mean, std)

    return initializer
+
+
+def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
+    """Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers).
+
+    Args:
+        sigma (float): the standard deviation before scaling by the number of layers. Defaults 1.0.
+        num_layers (int): the number of layers used to scale sigma down. Defaults 1.
+
+    Returns:
+        Callable: a function that fills the given tensor in place via ``nn.init.uniform_``.
+    """
+    std = sigma / math.sqrt(2.0 * num_layers)
+    # Var[Uniform(-a, a)] = a^2 / 3, hence the bound giving the requested std is a = sqrt(3) * std
+    a = math.sqrt(3.0) * std
+
+    def init_(tensor):
+        return nn.init.uniform_(tensor, -a, a)
+
+    return init_
+
+
+def uniform_(mean: float = 0.0, std: float = 1.0):
+    r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
+
+     .. math::
+        \mathcal{U}(mean-a, mean+a), \quad a = \sqrt{3} \cdot std,
+
+    so that the standard deviation of the drawn values equals ``std``.
+
+    Args:
+        mean (float): the mean of the uniform distribution. Defaults 0.0.
+        std (float): the standard deviation of the uniform distribution. Defaults 1.0.
+
+    Returns:
+        Callable: a function that fills the given tensor in place via ``nn.init.uniform_``.
+    """
+
+    # Var[U(mean-a, mean+a)] = a^2 / 3, hence a = sqrt(3) * std
+    a = math.sqrt(3.0) * std
+
+    def initializer(tensor: Tensor):
+        return nn.init.uniform_(tensor, mean - a, mean + a)
+
+    return initializer
--- a/internlm/initialize/initialize_trainer.py
+++ b/internlm/initialize/initialize_trainer.py
@ -43,8 +43,8 @@ def initialize_trainer(
    loaded into gpc.config.

    Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
-        optimizer (:class:`BaseOptimizer`.
+        model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
+        optimizer (:class:`BaseOptimizer`): Your optimizer for training.
        criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
        train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
        test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@ -10,9 +10,10 @@ import torch

 from internlm.core.context import Config
 from internlm.core.context import global_context as gpc
+from internlm.monitor import initialize_light_monitor
 from internlm.utils.common import get_master_node
 from internlm.utils.logger import get_logger
-from internlm.utils.storage_manager import init_storage_manager
+from internlm.utils.timeout import llm_timeout

 logger = get_logger(__file__)

@ -22,7 +23,7 @@ def get_default_parser():
    Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.

    Returns:
-       Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
+       Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="path to the config file")
@ -97,6 +98,13 @@ def args_sanity_check():
    if "valid_every" not in data:
        data._add_item("valid_every", 0)

+    if "empty_cache_and_diag_interval" not in data:
+        data._add_item("empty_cache_and_diag_interval", 50)
+
+    if "diag_outlier_ratio" not in data:
+        data._add_item("diag_outlier_ratio", 1.1)
+    data.diag_outlier_ratio = max(1, data.diag_outlier_ratio)
+
    if gpc.is_rank_for_log():
        logger.info("+" * 15 + " Data Info " + "+" * 15)  # pylint: disable=W1201
        logger.info(f"seq_len: {data.seq_len}")
@ -111,7 +119,7 @@ def args_sanity_check():
    # processing the checkpoint config
    ckpt = gpc.config.ckpt
    if "enable_save_ckpt" not in ckpt:
-        ckpt._add_item("enable_save_ckpt", False)
+        ckpt._add_item("enable_save_ckpt", True)

    # Saving checkpoint args.
    if ckpt.enable_save_ckpt:
@ -137,9 +145,6 @@ def args_sanity_check():
        if not ckpt.async_upload:
            ckpt._add_item("async_upload_tmp_folder", None)

-        if "snapshot_ckpt_folder" not in ckpt:
-            ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
-
        if "oss_snapshot_freq" not in ckpt:
            ckpt._add_item("oss_snapshot_freq", float("inf"))  # if oss_snapshot_freq not given, we disable.
    else:
@ -149,44 +154,23 @@ def args_sanity_check():
        ckpt._add_item("async_upload", False)
        ckpt._add_item("async_upload_tmp_folder", None)
        ckpt._add_item("snapshot_ckpt_folder", None)
-        ckpt._add_item("snapshot_ckpt_folder", None)
-
-    # Loading checkpoint args.
-    if "load_model_only_folder" not in ckpt:
-        ckpt._add_item("load_model_only_folder", None)

    if "load_ckpt_folder" not in ckpt:
        ckpt._add_item("load_ckpt_folder", None)

-    if "load_optimizer" not in ckpt:
-        ckpt._add_item("load_optimizer", True)
-
    if "stop_file_path" not in ckpt:
        ckpt._add_item("stop_file_path", None)

-    if "load_given_ckpt" not in ckpt:
-        # If 'load_given_ckpt' is not given, we set it to False, so internlm can have opportunity
+    if "auto_resume" not in ckpt:
+        # If 'auto_resume' is not given, we set it to True, so internlm can have opportunity
        # to auto-load latest checkpoint.
-        ckpt._add_item("load_given_ckpt", False)
-
-    if ckpt.load_given_ckpt:
-        # Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
-        if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
-            logger.warning(
-                "Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
-and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
-            )
-            ckpt.load_model_only_folder = None
+        ckpt._add_item("auto_resume", True)

    if gpc.is_rank_for_log():
        logger.info("+" * 15 + " Ckpt Info " + "+" * 15)  # pylint: disable=W1201
        logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
        logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
        logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
-        logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
-
-    # initialization storage manager
-    init_storage_manager(ckpt)

    # tensorboard writer config
    if "enable_tb" not in gpc.config:
@ -279,9 +263,22 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
            gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
        ), "sequence parallel does not support use_flash_attn=False"

-    # feishu webhook address for alerting
-    if "alert_address" not in gpc.config:
-        gpc.config._add_item("alert_address", None)
+    # monitoring default config
+    monitor_default_config = {
+        "alert_address": None,  # compatible with old alert config
+        "monitor": {  # new monitoring config
+            "alert": {"enable_feishu_alert": False, "feishu_alert_address": None, "light_monitor_address": None}
+        },
+    }
+
+    for key, value in monitor_default_config.items():
+        if key not in gpc.config:
+            gpc.config._add_item(key, value)
+
+    alert = gpc.config.monitor.alert
+
+    if alert.enable_feishu_alert and not alert.feishu_alert_address and gpc.is_rank_for_log():
+        logger.warning("alert is enable but alert_address is not set")

    optim_ckpt = gpc.config.hybrid_zero_optimizer
    if "zero_overlap_communication" in optim_ckpt:
@ -431,6 +428,7 @@ def launch_from_torch(
    )


+@llm_timeout(func_name="initialize_distributed_env")
 def initialize_distributed_env(
    config: str,
    launcher: str = "slurm",
@ -464,3 +462,20 @@ def initialize_distributed_env(

    if args_check:
        args_sanity_check()
+
+    # init light monitor client
+    alert_config = gpc.config.monitor.alert
+    if alert_config.enable_feishu_alert and gpc.is_rank_for_log():
+        light_monitor_address = alert_config.light_monitor_address
+        if light_monitor_address:
+            initialize_light_monitor(light_monitor_address)
+        else:
+            logger.warning("monitor address is none, monitor could not be used!")
+
+
+def get_config_value(config, key, defalut):
+    """
+    Look up ``key`` in ``config`` and fall back to ``defalut`` when the lookup raises ``KeyError``.
+
+    Args:
+        config: Any object that supports ``config[key]`` subscription.
+        key: The key to look up.
+        defalut: The value returned when ``config[key]`` raises ``KeyError``.
+
+    Returns:
+        ``config[key]`` if the lookup succeeds, otherwise ``defalut``.
+    """
+    try:
+        return config[key]
+    except KeyError:
+        return defalut
--- a/internlm/initialize/legacy/init.py
+++ b/internlm/initialize/legacy/init.py
--- a/internlm/initialize/legacy/launch.py
+++ b/internlm/initialize/legacy/launch.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from internlm.initialize.launch import get_config_value
+from internlm.utils.logger import get_logger
+
+logger = get_logger(__file__)
+
+
+def auto_resume_sanity_check(ckpt_config):
+    """
+    Derive the auto-resume setting from the legacy 'load_given_ckpt' option.
+
+    Args:
+        ckpt_config: Checkpoint config that is queried for the 'load_given_ckpt' key.
+
+    Returns:
+        True when 'load_given_ckpt' is missing or None, otherwise the negation of its value.
+    """
+    legacy_flag = get_config_value(ckpt_config, "load_given_ckpt", None)
+    # a missing legacy flag keeps auto resume switched on, which is the default
+    return True if legacy_flag is None else not legacy_flag
+
+
+def ckpt_info_sanity_check(ckpt_config):
+    """
+    Translate the legacy checkpoint-loading options into a single load description.
+
+    The legacy keys 'load_ckpt_folder', 'load_model_only_folder' and 'load_optimizer' are read
+    from ``ckpt_config``; missing keys default to None, None and True respectively.
+
+    Args:
+        ckpt_config: Checkpoint config that supports ``ckpt_config[key]`` lookups.
+
+    Returns:
+        A dict with 'path', 'content' and 'ckpt_type' entries describing what to load,
+        or None when neither legacy folder option is set.
+
+    Raises:
+        AssertionError: If 'load_ckpt_folder' and 'load_model_only_folder' are both set.
+        TypeError: If 'load_ckpt_folder' is neither a str nor None.
+    """
+    load_ckpt_folder = get_config_value(ckpt_config, "load_ckpt_folder", None)
+    load_model_only_folder = get_config_value(ckpt_config, "load_model_only_folder", None)
+
+    if load_model_only_folder is not None:
+        assert load_ckpt_folder is None, (
+            "Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, "
+            "please set only one of them."
+        )
+        return dict(path=load_model_only_folder, content=("model",), ckpt_type="internlm")
+
+    if load_ckpt_folder is None:
+        return None
+
+    if not isinstance(load_ckpt_folder, str):
+        # The previous `assert "<message>"` tested a non-empty string, which is always true, so an
+        # unsupported value fell through and silently returned None. Fail loudly instead.
+        raise TypeError(f"Unsupport data type:'{type(load_ckpt_folder)}' for config.ckpt arg: 'load_ckpt_folder'")
+
+    load_optimizer = get_config_value(ckpt_config, "load_optimizer", True)
+    # optimizer states are only part of the load content when 'load_optimizer' is truthy
+    content = ("model", "sampler", "optimizer") if load_optimizer else ("model", "sampler")
+    return dict(path=load_ckpt_folder, content=content, ckpt_type="internlm")
--- a/internlm/model/embedding.py
+++ b/internlm/model/embedding.py
@ -137,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
        """ """
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
-        self.register_buffer("inv_freq", inv_freq)
+        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
        self.scale_base = scale_base
-        scale = (
+        self.scale = (
            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
            if scale_base > 0
            else None
        )
-        self.register_buffer("scale", scale)

        self._seq_len_cached = 0
        self._cos_cached = None
@ -220,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
                self._cos_k_cached[seqlen_offset:],
                self._sin_k_cached[seqlen_offset:],
            )
+
+    def _single_forward(self, x, indexes=0):
+        """
+        Apply the legacy rotary embedding to ``x`` using the cached cos/sin entries picked by ``indexes``.
+
+        Only valid when ``self.scale`` is None. The cos/sin cache is refreshed through
+        ``_update_cos_sin_cache(x, indexes)`` before it is indexed.
+        """
+        assert self.scale is None
+        self._update_cos_sin_cache(x, indexes)
+        cos_part = self._cos_cached[indexes]
+        sin_part = self._sin_cached[indexes]
+        # a leading dimension is added for the call and squeezed away from the result
+        return legacy_apply_rotary_embed(x[None, ...], cos_part, sin_part).squeeze(0)
+
+    def _single_eval_forward(self, x, seqlen_offset=0):
+        """
+        Apply the legacy rotary embedding to ``x`` using the cached cos/sin entries from ``seqlen_offset`` onwards.
+
+        Only valid when ``self.scale`` is None. The cos/sin cache is refreshed with
+        ``seqlen_offset + x.shape[1]`` before it is sliced.
+        """
+        assert self.scale is None
+        required_len = seqlen_offset + x.shape[1]
+        self._update_cos_sin_cache(x, required_len)
+        cos_part = self._cos_cached[seqlen_offset:]
+        sin_part = self._sin_cached[seqlen_offset:]
+        return legacy_apply_rotary_embed(x, cos_part, sin_part)
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@ -9,7 +9,7 @@ from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
 from flash_attn.utils.distributed import all_reduce, reduce_scatter
 from torch import nn

-from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
+from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.model.utils import fused_dense_func_torch

@ -195,12 +195,6 @@ class FeedForward(nn.Module):
            device=device,
            dtype=dtype,
        )
-        # need to assign tp attribute so that colossalai know it is tensor parallel module
-
-        if gpc.get_world_size(ParallelMode.TENSOR) > 1:
-            for name in ["w1", "w2", "w3"]:
-                for param in getattr(self, name).parameters():
-                    setattr(param, IS_TENSOR_PARALLEL, True)

    def forward(self, x):
        out = self.w3(F.silu(self.w1(x)) * self.w2(x))
--- a/internlm/model/metrics.py
+++ b/internlm/model/metrics.py
@ -176,7 +176,7 @@ class AccPerplex:
            res.update(ds_acc)
            res.update(ds_tokens)

-        loss_res = self.loss_with_type_id.get_metric()
+        loss_res = self.loss_with_type_id.get_metric(reset)
        res.update(loss_res)

        return res
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@ -548,7 +548,7 @@ def build_model_with_cfg(
    moe_use_residual: bool = False,
 ):
    """
-    Builde model with config
+    Build model with config.

    Args:
        num_chunks (int): The number of partitions in pipeline parallel. 1 by default.
--- a/internlm/monitor/init.py
+++ b/internlm/monitor/init.py
@ -1,4 +1,11 @@
+from .alert import initialize_light_monitor, send_heartbeat
 from .monitor import initialize_monitor_manager, send_alert_message
 from .utils import set_env_var

-__all__ = ["send_alert_message", "initialize_monitor_manager", "set_env_var"]
+__all__ = [
+    "send_alert_message",
+    "initialize_monitor_manager",
+    "set_env_var",
+    "initialize_light_monitor",
+    "send_heartbeat",
+]
--- a/internlm/monitor/alert.py
+++ b/internlm/monitor/alert.py
@ -1,8 +1,59 @@
 import json
+import math
+import os
+import re
 import time
+from typing import Dict

 import requests

+from internlm.utils.logger import get_logger
+
+logger = get_logger(__file__)
+
+
+def initialize_light_monitor(monitor_address: str = None):
+    """
+    Initialize the light monitor client on a best-effort basis.
+
+    Any exception raised while importing ``uniscale_monitoring`` or calling its ``init_monitor``
+    is caught and logged as a warning instead of being propagated.
+
+    Args:
+        monitor_address (str): The address passed through to ``init_monitor``.
+    """
+    try:
+        # imported lazily so that a missing package only produces the warning below
+        from uniscale_monitoring import init_monitor as start_light_monitor
+
+        start_light_monitor(monitor_address)
+    except Exception as err:
+        logger.warning(f"init monitor meet error: {err}")
+
+
+def send_heartbeat(msg_type: str, msg: Dict):
+    """
+    Flatten ``msg`` into a single-level dict and send it through ``uniscale_monitoring.send_meta``.
+
+    Dict values in ``msg`` are flattened one level into "<outer>_<inner>" keys. Every key is cut
+    at its first space and each character outside ``[a-zA-Z0-9_]`` is replaced by "_". Float NaN
+    values are sent as None. Any exception is caught and logged as a warning.
+
+    Args:
+        msg_type (str): Recorded under "msg_type" when it is one of "train_metrics",
+            "init_time" or "stage_time"; other values are not recorded.
+        msg (Dict): The metrics to send.
+    """
+
+    def _normalize(raw_key, raw_value):
+        # keep the text before the first space, then map unsupported characters to "_"
+        clean_key = re.sub(r"[^a-zA-Z0-9_]", "_", raw_key.split(" ")[0])
+        is_nan = isinstance(raw_value, float) and math.isnan(raw_value)
+        return clean_key, (None if is_nan else raw_value)
+
+    try:
+        from uniscale_monitoring import send_meta
+
+        payload = {}
+        for field, content in msg.items():
+            if isinstance(content, Dict):
+                pairs = [(f"{field}_{inner}", inner_value) for inner, inner_value in content.items()]
+            else:
+                pairs = [(field, content)]
+            for raw_key, raw_value in pairs:
+                clean_key, clean_value = _normalize(raw_key, raw_value)
+                payload[clean_key] = clean_value
+
+        cluster_name = os.getenv("CLUSTER_NAME")
+        if cluster_name:
+            payload["cluster"] = cluster_name
+        if msg_type in ("train_metrics", "init_time", "stage_time"):
+            payload["msg_type"] = msg_type
+        send_meta(payload, timeout=0.1)
+    except Exception as err:
+        logger.warning(f"send heartbeat meet error: {err}")
+

 def send_feishu_msg_with_webhook(webhook: str, title: str, message: str):
    """
--- a/internlm/monitor/monitor.py
+++ b/internlm/monitor/monitor.py
@ -211,6 +211,14 @@ monitor_manager = MonitorManager()

@contextmanager
 def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
+    """
+    Initialize monitor manager for monitoring training lifetime and alerting exception info to Feishu.
+
+    Args:
+        job_name (str): The training job name.
+        alert_address (str): The Feishu webhook address for sending alert messages.
+    """
+
    if alert_address is not None:
        try:
            monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)
@ -218,9 +226,7 @@ def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.")
            yield
        finally:
-            send_alert_message(
-                address=gpc.config.alert_address, message=f"Training in {socket.gethostname()} completed."
-            )
+            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} completed.")
            monitor_manager.stop_monitor()
    else:
        yield
--- a/internlm/solver/optimizer/init.py
+++ b/internlm/solver/optimizer/init.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-from .hybrid_zero_optim import HybridZeroOptimizer
+from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff

-__all__ = ["HybridZeroOptimizer"]
+__all__ = ["HybridZeroOptimizer", "reload_zero_fp32_buff"]
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@ -33,6 +33,7 @@ from internlm.solver.optimizer.utils import (
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
+from internlm.utils.timeout import llm_timeout

 from .utils import compute_norm

@ -125,6 +126,7 @@ class HybridZeroOptimizer(BaseOptimizer):
        self._param_store = ParameterStore(ParallelMode.ZERO1)
        self._grad_store = GradientStore(ParallelMode.DATA)
        self._bucket_store = BucketStore(ParallelMode.DATA)
+        self._bucket_in_progress = []

        # fp16 and fp32 params for mixed precision training
        self._fp16_param_groups = dict()
@ -134,6 +136,8 @@ class HybridZeroOptimizer(BaseOptimizer):
        # self._overlap_communication = overlap_communication
        self._reduce_bucket_size = reduce_bucket_size

+        self._comm_bcast_stream = torch.cuda.Stream()
+
        # gradient scaler
        self.grad_scaler = DynamicGradScaler(
            initial_scale=initial_scale,
@ -165,9 +169,6 @@ class HybridZeroOptimizer(BaseOptimizer):
        self._param_bcast_sync_handler = param_bcast_sync_handler
        if self._overlap_sync_param:
            assert self._param_bcast_sync_handler is not None
-            self._broadcast_comm_stream = torch.cuda.Stream()
-        else:
-            self._broadcast_comm_stream = torch.cuda.current_stream()

        # iterate over the param group in the optimizer
        # partition these param groups for data parallel training
@ -238,13 +239,6 @@ class HybridZeroOptimizer(BaseOptimizer):
        # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled.
        self.skip_grad_reduce = False

-        # initialize communication stream for
-        # communication-computation overlapping
-        if self._overlap_sync_grad:
-            self._comm_stream = torch.cuda.Stream()
-        else:
-            self._comm_stream = torch.cuda.current_stream()
-
        # reduction hook is only used if overlapping communication
        # if it is stage 1 without overlapping, no hook will be attached
        if self._overlap_sync_grad:
@ -406,34 +400,41 @@ class HybridZeroOptimizer(BaseOptimizer):

    def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size):
        grad_buckets_by_dtype = split_half_float_double(grads)
-
+        next_bucket_list = []
+        # add parameters into bucket for reduction
        for tensor_list in grad_buckets_by_dtype:
            param_bucket = TensorBucket(size=bucket_size)
            for tensor in tensor_list:
                param_bucket.add_to_bucket(tensor, allow_oversize=True)
-                if param_bucket.is_full_or_oversized():
-                    self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
-                    param_bucket.empty()
            if not param_bucket.is_empty():
                self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
+            next_bucket_list.append(param_bucket)
+
+        # wait for the completion of previous bucket list reduction, and do unflatten_and_copy()
+        # here we can also overlap the communication with some memcpy operation caused by bucket.flatten()
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()
+
+        # after the completion of bucket list reduction, add new buckets into _bucket_in_progress
+        self._bucket_in_progress = next_bucket_list.copy()

    def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
-        if self._overlap_sync_grad:
-            self._comm_stream.synchronize()
-            self._param_store.clear_grads_of_previous_reduced_params()
+        # flatten the tensors and do allreduce
+        bucket.flatten()
+        bucket.commu_handle = reduce_tensor(
+            tensor=bucket.get_flat_tensor(),
+            dtype=None,
+            dst_rank=reduce_rank,
+            parallel_mode=ParallelMode.DATA,
+        )

-        with torch.cuda.stream(self._comm_stream):
-            flat = bucket.flatten()
-            reduced_flat = reduce_tensor(
-                tensor=flat,
-                dtype=self.dtype,
-                dst_rank=reduce_rank,
-                parallel_mode=ParallelMode.DATA,
-            )
-
-            # update the reduced tensor
-            if reduce_rank is None or reduce_rank == self._zero_local_rank:
-                bucket.unflatten_and_copy(reduced_flat)
+        # update the reduced tensor
+        if reduce_rank is None or reduce_rank == self._zero_local_rank:
+            bucket.set_unflatten_and_copy_flag(flag=True)

    def _has_inf_or_nan(self, tensor):
        try:
@ -517,6 +518,7 @@ class HybridZeroOptimizer(BaseOptimizer):
            grads = [self.padding_grad]
            params = [self.padding_tensor]

+        norm = 0
        if self._clip_grad_norm > 0:
            # this norm is before scaling, it will be very large
            norm = compute_norm(
@ -549,6 +551,7 @@ class HybridZeroOptimizer(BaseOptimizer):
        all_groups_norm = scaled_norm_tensor.item()
        return all_groups_norm

+    @llm_timeout(func_name="optim_step")
    def step(self, closure=None):
        """Performs a single optimization step.

@ -581,22 +584,29 @@ class HybridZeroOptimizer(BaseOptimizer):
                groups_norms.append(self._compute_norm_with_stage(group_id=group_id))

        # clear reduced grads
-        if self._overlap_sync_grad:
-            # grads in the last bucket is reduced
-            self._comm_stream.synchronize()
-            self._param_store.clear_grads_of_previous_reduced_params()
+        # grads in the last bucket is reduced
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()

        # compute norm for gradients in the last bucket
-        total_norms = []
+        total_norms = {}
        for group_id in range(self.num_param_groups):
+            group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
+            group_name = f"{group_id}_{group_name}"
            if self._is_moe_group(self.optim.param_groups[group_id]):
-                total_norms.append(self._compute_norm_with_moe_group(group_id=group_id))
+                total_norms[group_name] = self._compute_norm_with_moe_group(group_id=group_id)
            else:
-                total_norms.append(
-                    self._compute_norm_with_stage(
-                        group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
+                total_norms[group_name] = self._compute_norm_with_stage(
+                        group_id=group_id,
+                        last_bucket=True,
+                        last_stage=True,
+                        previous_norm=groups_norms[group_id]
                    )
-                )
+
        timer("sync_grad").start()
        self._sync_grad()
        timer("sync_grad").stop()
@ -608,28 +618,45 @@ class HybridZeroOptimizer(BaseOptimizer):

        # check for overflow
        found_inf = False
+        found_nan = False
        # if there is INF values in grades, compute_norm func would also returns -1
        # thus, we try to avoid call _check_overflow here
        # found_inf = self._check_overflow()
        # Because you may encounter inf when computing norm

-        if -1 in norms:
+        if -1 in norms.values():
            found_inf = True
+
+        if -2 in norms.values():
+            found_nan = True
+
        loss_scale = float(self.loss_scale.item())  # backup
        if gpc.config.model.dtype is not torch.float32:
            self.grad_scaler.update(found_inf)
+
        # update loss scale if overflow occurs
        if found_inf:
            if gpc.is_rank_for_log():
                logger.warning("Overflow occurs, please check it.")
                send_alert_message(
-                    address=gpc.config.alert_address,
+                    address=gpc.config.monitor.alert.feishu_alert_address,
                    message="Overflow occurs, please check it.",
                )
            self._grad_store._averaged_gradients = dict()
            self.zero_grad()
            return False, norms

+        if found_nan:
+            if gpc.is_rank_for_log():
+                logger.warning("Nan grad norm occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Nan grad norm  occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms
+
        # copy the grad of fp16 param to fp32 param
        single_grad_partition_groups = []
        for group_id in range(self.num_param_groups):
@ -660,15 +687,20 @@ class HybridZeroOptimizer(BaseOptimizer):

        # unscale and clip grads
        # get the global norm
-        global_norm_groups = []
+        global_norm_groups = {}
        if self._clip_grad_norm > 0:
-            for norm in norms:
-                global_norm_groups.append(norm**0.5)
+            for group_name, norm in norms.items():
+                global_norm_groups[group_name] = norm**0.5

        # the following operations are performed only on the rank to which parameters are assigned.
        if gpc.config.model.dtype is not torch.float32:
-            if len(single_grad_partition_groups) != 0:
-                self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
+            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
+                self._unscale_and_clip_grads(
+                    single_grad_partition_groups,
+                    list(global_norm_groups.values()),
+                    loss_scale,
+                )
+
        # update the parameters
        timer("step").start()

@ -687,14 +719,17 @@ class HybridZeroOptimizer(BaseOptimizer):
                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
                    fp16_param.data.copy_(fp32_param)

-        with torch.cuda.stream(self._broadcast_comm_stream):
+        torch.cuda.synchronize()
+        with torch.cuda.stream(self._comm_bcast_stream):
            self.broadcast_params()

        timer("step").stop()

        # update gradients may not be needed here, because the sync_params function is used in initialization,
        # so synchronization is maintained
-        return True, [global_norm / loss_scale for global_norm in global_norm_groups]
+        for group_name, global_norm in global_norm_groups.items():
+            global_norm_groups[group_name] = global_norm / loss_scale
+        return True, global_norm_groups

    def broadcast_params(self):
        handles = []
@ -814,3 +849,17 @@ class HybridZeroOptimizer(BaseOptimizer):

        if "zero_devide_optim_plan" in states:
            self.params_per_rank_id_dict = states["zero_devide_optim_plan"]
+
+
+def reload_zero_fp32_buff(optimizer):
+    """
+    Refresh the fp32 buffers of a ``HybridZeroOptimizer`` from its flat fp16 parameters.
+
+    If we use the AMP optimizer, its fp32 buffer must be updated with the newly loaded weights;
+    otherwise model weights would have to be loaded before zero is initialized. Optimizers of
+    any other type are left untouched.
+
+    Args:
+        optimizer: The optimizer to refresh.
+    """
+    if not isinstance(optimizer, HybridZeroOptimizer):
+        return
+
+    for group_id, param_group in enumerate(optimizer.optim.param_groups):
+        if not optimizer.param_group_has_params[group_id]:
+            continue
+        # the flat fp16 params have already been updated by 'load_model_checkpoint'
+        flat_fp16 = optimizer._param_store.get_flat_fp16_param_by_rank_group(optimizer._zero_local_rank, group_id)
+        # param_group["params"] is the fp32 flatten optimizer states of this zero rank.
+        fp32_target = param_group["params"][0]
+        fp32_target.data.copy_(flat_fp16.float())
--- a/internlm/solver/optimizer/store.py
+++ b/internlm/solver/optimizer/store.py
@ -249,11 +249,17 @@ class ParameterStore(BaseStore):
        if not last_bucket:
            if group_id not in self._former_bucket_reduced_param:
                return [], []
-            return self._former_bucket_reduced_param[group_id], self._former_bucket_reduced_grad[group_id]
+            return (
+                self._former_bucket_reduced_param[group_id],
+                self._former_bucket_reduced_grad[group_id],
+            )
        else:
            if group_id not in self._last_bucket_reduced_param:
                return [], []
-            return self._last_bucket_reduced_param[group_id], self._last_bucket_reduced_grad[group_id]
+            return (
+                self._last_bucket_reduced_param[group_id],
+                self._last_bucket_reduced_grad[group_id],
+            )

    def reset_reduced_data_for_compute_norm(self):
        self._former_bucket_reduced_param = {}
@ -277,6 +283,9 @@ class TensorBucket:
        self._max_size = size
        self._current_size = 0
        self._bucket = []
+        self._flat_tensor = None
+        self._unflatten_and_copy_flag = False
+        self.commu_handle = None

    @property
    def max_size(self):
@ -292,6 +301,15 @@ class TensorBucket:
    def is_empty(self):
        return len(self._bucket) == 0

+    def set_unflatten_and_copy_flag(self, flag):
+        """Store ``flag``; ``unflatten_and_copy`` only copies the flat tensor back when it is truthy."""
+        self._unflatten_and_copy_flag = flag
+
+    def get_unflatten_and_copy_flag(self):
+        """Return the flag set by ``set_unflatten_and_copy_flag`` (False on a new bucket)."""
+        return self._unflatten_and_copy_flag
+
+    def get_flat_tensor(self):
+        """Return the tensor stored by ``flatten()``; None on a new bucket and after ``empty()``."""
+        return self._flat_tensor
+
    def add_to_bucket(self, tensor, allow_oversize=False):
        tensor_size = tensor.numel()

@ -312,11 +330,14 @@ class TensorBucket:
    def empty(self):
        self._bucket = []
        self._size = 0
+        self._flat_tensor = None
+        self.commu_handle = None

    def flatten(self):
-        return _flatten_dense_tensors(self._bucket)
+        self._flat_tensor = _flatten_dense_tensors(self._bucket)

-    def unflatten_and_copy(self, flat_tensor):
-        unflattened_tensor_list = _unflatten_dense_tensors(flat_tensor, self._bucket)
-        for old, new in zip(self._bucket, unflattened_tensor_list):
-            old.copy_(new)
+    def unflatten_and_copy(self):
+        if self._unflatten_and_copy_flag:
+            unflattened_tensor_list = _unflatten_dense_tensors(self._flat_tensor, self._bucket)
+            for old, new in zip(self._bucket, unflattened_tensor_list):
+                old.copy_(new)
--- a/internlm/solver/optimizer/utils.py
+++ b/internlm/solver/optimizer/utils.py
@ -95,37 +95,34 @@ def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.
    :type parallel_mode: ParallelMode, optional
    """
    # use the original dtype
-    if dtype is None:
-        dtype = tensor.dtype
+    # if dtype is None:
+    assert dtype is None
+    dtype = tensor.dtype

    # cast the data to specified dtype for reduce/all-reduce
-    if tensor.dtype != dtype:
-        tensor_to_reduce = tensor.to(dtype)
-    else:
-        tensor_to_reduce = tensor
+    # if tensor.dtype != dtype:
+    #     tensor_to_reduce = tensor.to(dtype)
+    # else:
+    #     tensor_to_reduce = tensor

-    world_size = gpc.get_world_size(parallel_mode)
+    # world_size = gpc.get_world_size(parallel_mode)
+    # tensor.div_(world_size)
    group = gpc.get_group(parallel_mode)
-    tensor_to_reduce.div_(world_size)

    # if rank is None, all reduce will be used
    # else, reduce is used
    use_all_reduce = dst_rank is None

    if use_all_reduce:
-        dist.all_reduce(tensor_to_reduce, group=group)
+        handle = dist.all_reduce(tensor=tensor, group=group, op=torch.distributed.ReduceOp.AVG, async_op=True)
    else:
        ranks_in_group = gpc.get_ranks_in_group(parallel_mode)
        global_rank = ranks_in_group[dst_rank]
-        dist.reduce(tensor=tensor_to_reduce, dst=global_rank, group=group)
+        handle = dist.reduce(
+            tensor=tensor, dst=global_rank, group=group, op=torch.distributed.ReduceOp.AVG, async_op=True
+        )

-    # recover the original dtype
-    if tensor.dtype != dtype and tensor is not tensor_to_reduce:
-        local_rank = gpc.get_local_rank(parallel_mode)
-        if use_all_reduce or dst_rank == local_rank:
-            tensor.copy_(tensor_to_reduce)
-
-    return tensor
+    return handle


 def has_inf_or_nan(tensor):
@ -315,6 +312,9 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
    if total_norm == float("inf") or total_norm == -float("inf"):
        total_norm = -1

+    if math.isnan(total_norm):
+        total_norm = -2
+
    return total_norm


--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@ -26,7 +26,7 @@ from internlm.data.packed_dataset import (
 )
 from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
 from internlm.model.moe import create_moe_param_groups
-from internlm.monitor import set_env_var
+from internlm.monitor import send_heartbeat, set_env_var
 from internlm.monitor.monitor import monitor_manager as mm
 from internlm.solver.beta2_scheduler import Beta2Scheduler
 from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
@ -41,15 +41,19 @@ from internlm.utils.parallel import (
    sync_model_param_within_tp,
 )
 from internlm.utils.registry import MODEL_INITIALIZER
+from internlm.utils.timeout import llm_timeout

 logger = get_logger(__file__)


+@llm_timeout(func_name="initialize_model")
 def initialize_model():
    """
-    Initialize model.
+    Initialize model with Automatic Mixed Precision.

-    Returns: The neural network model to be trained or evaluated.
+    Returns:
+        torch.nn.Module:
+            The neural network model to be trained or evaluated.
    """

    model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
@ -89,14 +93,16 @@ def initialize_model():
    return model


+@llm_timeout(func_name="initialize_optimizer")
 def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
    """
    Initialize optimizer.

    Args:
-        model (torch.nn.Module): Your model instance to be trained or evaluated.
+        model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated.

-    Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
+    Returns:
+        A tuple of (optimizer, beta2_scheduler, lr_scheduler).
    """
    if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
        param_bcast_sync_handler = ParamBcastSyncHandler(model)
@ -130,13 +136,21 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
    return optimizer, beta2_scheduler, lr_scheduler


+@llm_timeout(func_name="get_train_data_loader")
 def get_train_data_loader(
    num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
 ):
    """
    Generate and return the training data loader.

-    Returns: A tuple of (train_dl, dataset_types).
+    Args:
+        num_worker (:class:`int`): number of subprocesses used for dataloader.
+        dataset_generate_func (:class:`Callable`, optional): generate function for dataset.
+        train_sampler (:class:`torch.utils.data.sampler`, optional): dataset sampler for training dataloader.
+        train_collate_fn (:class:`Callable`, optional): collate function for training dataloader.
+
+    Returns:
+        A tuple of (train_dl, dataset_types).
    """

    # Get the dataset types
@ -202,6 +216,7 @@ def get_train_data_loader(
    return train_dl, dataset_types


+@llm_timeout(func_name="get_validation_data_loader")
 def get_validation_data_loader(
    num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
 ):
@ -263,6 +278,7 @@ def get_validation_data_loader(
    return val_dls


+@llm_timeout(func_name="load_new_batch")
 def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
    """
    Load and return the new batch data based on training data loader.
@ -320,6 +336,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):
    )


+@llm_timeout(func_name="record_current_batch_training_metrics")
 def record_current_batch_training_metrics(
    get_tflops_func,
    logger,
@ -344,6 +361,7 @@ def record_current_batch_training_metrics(

    set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))

+    timer.store_last_timers()
    if success_update in (0, True):
        train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
    if is_no_pp_or_last_stage():
@ -373,12 +391,6 @@ def record_current_batch_training_metrics(

        tflops = get_tflops_func((time.time() - start_time))

-        # change grad_norm list to dict for calling writer's add_scalars
-        grad_norm_dict = {}
-        assert isinstance(grad_norm, list)
-        for inx, norm in enumerate(grad_norm):
-            grad_norm_dict[f"grad_norm_{inx}"] = norm
-
        infos = {
            "tflops": tflops,
            "step": batch_count,
@ -387,7 +399,7 @@ def record_current_batch_training_metrics(
            "tgs (tokens/gpu/second)": tk_per_gpu,
            "lr": lr,
            "loss_scale": scaler,
-            "grad_norm": grad_norm_dict,
+            "grad_norm": grad_norm,
        }

        infos["micro_num"] = len(batch[1])
@ -413,20 +425,23 @@ def record_current_batch_training_metrics(
            else:
                writer.add_scalar(key=key, value=value, step=train_state.step_count)

+        if gpc.config.monitor.alert.get("light_monitor_address", None) and batch_count % 50 == 0:
+            send_heartbeat("train_metrics", infos)
+
        if update_panel:
            # metrics shown with dashboard panels
            panel_metrics = {
                "step": batch_count,
                "lr": lr,
                "num_consumed_tokens": train_state.num_consumed_tokens,
-                "loss": loss.item() - moe_loss.item(),
+                "loss": loss.item() + moe_loss.item(),
                "flops": tflops,
                "tgs": tk_per_gpu,
                "acc": acc_perplex["acc"],
                "perplexity": acc_perplex["perplexity"],
                "fwd_bwd_time": fwd_bwd_time,
            }
-            for norm_key, norm_value in grad_norm_dict.items():
+            for norm_key, norm_value in grad_norm.items():
                panel_metrics[norm_key] = norm_value

            logger.info(
@ -438,4 +453,8 @@ def record_current_batch_training_metrics(
            logger.info(line)

        # if loss spike occurs, send alert info to feishu
-        mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
+        mm.monitor_loss_spike(
+            alert_address=gpc.config.monitor.alert.feishu_alert_address,
+            step_count=batch_count,
+            cur_step_loss=loss.item(),
+        )
--- a/internlm/utils/evaluation.py
+++ b/internlm/utils/evaluation.py
@ -76,7 +76,7 @@ def evaluate_on_val_dls(
        data_cfg = gpc.config.data

        for val_name, val_dl in val_dls.items():
-            if len(val_dl) == 0 and verbose and not streaming:
+            if not streaming and len(val_dl) == 0 and verbose:
                logger.info(f"Validation dataset: {val_name} is empty")
                continue

@ -136,7 +136,7 @@ def evaluate_on_val_dls(
            dist.barrier()

            val_res = val_metric.get_metric()
-            if verbose and len(val_dl) != 0:
+            if verbose and (streaming or len(val_dl) != 0):
                val_loss = val_loss / (val_idx + 1 + 1e-6)
                infos = {
                    "step": step_count,
--- a/internlm/utils/gputest.py
+++ b/internlm/utils/gputest.py
@ -0,0 +1,256 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+import socket
+
+import torch
+import torch.distributed as dist
+from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
+from torch.utils import benchmark
+
+from internlm.monitor import send_alert_message
+from internlm.utils.logger import get_logger
+from internlm.utils.megatron_timers import megatron_timer as timer
+
+try:
+    import GPUtil
+    import psutil
+except ImportError:
+    GPUtil, psutil = None, None
+
+from internlm.core.context import ParallelMode
+from internlm.core.context import global_context as gpc
+from internlm.utils.common import get_current_device
+
+logger = get_logger(__file__)
+
+
+def empty_cache_and_diag(batch_count, interval=50):
+    """Empty the CUDA cache and, except on the first batch, run the diagnosis benchmarks.
+
+    Args:
+        batch_count (int): index of the current training batch.
+        interval (int): act once every ``interval`` batches. Values that truncate to an
+            integer <= 0 fall back to 50.
+    """
+    # Truncate before validating: a fractional interval such as 0.5 passes a plain
+    # ``interval <= 0`` check but becomes 0 after int(), which would make the modulo
+    # below raise ZeroDivisionError.
+    interval = int(interval)
+    if interval <= 0:
+        interval = 50
+    if batch_count % interval != 0:
+        return
+
+    # there is no need to do diag on the first batch
+    if batch_count > 0:
+        if gpc.is_rank_for_log():
+            logger.info("Empty Cache and Diagnosis GPU/NCCL/Timer ...")
+        with torch.no_grad():
+            timer_diagnosis()
+            bench_gpu()
+            bench_net()
+    # do empty_cache after the bench
+    torch.cuda.empty_cache()
+
+
+def benchmark_forward(
+    test_fn,
+    *inputs,
+    repeats=100,
+    amp=True,
+    amp_dtype=torch.float16,
+    **kwinputs,
+):
+    """Time the forward pass of an arbitrary callable with the PyTorch benchmark utility.
+
+    Args:
+        test_fn (Callable): function to benchmark, called as ``test_fn(*inputs, **kwinputs)``.
+        repeats (int): number of timed runs handed to ``Timer.timeit``.
+        amp (bool): whether CUDA autocast is enabled around each call.
+        amp_dtype (torch.dtype): dtype used by autocast.
+
+    Returns:
+        The ``mean`` attribute of the benchmark measurement.
+    """
+
+    def run_under_autocast(*args, **kwargs):
+        autocast_ctx = torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp)
+        with autocast_ctx:
+            test_fn(*args, **kwargs)
+
+    # names visible to the benchmarked statement
+    namespace = {"test_fn_amp": run_under_autocast, "inputs": inputs, "kwinputs": kwinputs}
+    measurement = benchmark.Timer(
+        stmt="test_fn_amp(*inputs, **kwinputs)",
+        globals=namespace,
+        num_threads=torch.get_num_threads(),
+    ).timeit(repeats)
+    return measurement.mean
+
+
+def flops(batch, seqlen, headdim, nheads, time_f):
+    """Compute the throughput (flop count / time / 10**12) for a given flash-attention run.
+
+    The flop count is ``4 * batch * seqlen**2 * nheads * headdim``.
+
+    Args:
+        batch (int): batch size.
+        seqlen (int): sequence length.
+        headdim (int): dimension of one attention head.
+        nheads (int): number of attention heads.
+        time_f (float): measured time of the run.
+
+    Returns:
+        float: the throughput, or 0.0 when ``time_f`` is NaN, infinite, zero or negative.
+    """
+    # Besides NaN, a zero time would raise ZeroDivisionError and a negative one would
+    # report a negative speed; treat every unusable measurement as "no throughput".
+    if not math.isfinite(time_f) or time_f <= 0:
+        return 0.0
+
+    flop = 4 * batch * seqlen**2 * nheads * headdim
+    return flop / time_f / 10**12
+
+
+def get_gpu_temperature():
+    """Get current GPU temperature.
+
+    Returns:
+        The temperature GPUtil reports for the current CUDA device, or -1 when GPUtil is
+        not installed, no CUDA device can be queried, or GPUtil does not list the device.
+    """
+    try:
+        gpu_id = torch.cuda.current_device()
+    except (AssertionError, RuntimeError):
+        # CUDA initialization can fail with either exception type when no usable
+        # device is present.
+        gpu_id = -1
+
+    if GPUtil is None or gpu_id < 0:
+        return -1
+
+    gpus = GPUtil.getGPUs()
+    # This helper is called while building alert messages, so an index that GPUtil
+    # does not know about must not raise and abort the diagnosis.
+    if gpu_id >= len(gpus):
+        return -1
+
+    return gpus[gpu_id].temperature
+
+
+def get_cpu_temperature():
+    """Get current CPU temperature.
+
+    Returns:
+        The first "coretemp" sensor reading from psutil, or -1 when psutil is not
+        installed, does not provide ``sensors_temperatures`` on this platform, or reports
+        no "coretemp" sensors.
+    """
+    if psutil is None:
+        return -1
+
+    # sensors_temperatures is not available on every platform psutil supports
+    read_sensors = getattr(psutil, "sensors_temperatures", None)
+    if read_sensors is None:
+        return -1
+
+    # This helper is called while building alert messages, so a host without a
+    # "coretemp" sensor group must not raise KeyError and abort the diagnosis.
+    core_temps = read_sensors().get("coretemp")
+    if not core_temps:
+        return -1
+
+    return core_temps[0].current
+
+
+def timer_diagnosis():
+    """Diagnose running time and report timers on which this rank is an outlier.
+
+    Each stored timer value is compared first against an average taken across the
+    data-parallel group and then against this rank's own stored history. Outliers are
+    logged as warnings and sent to the configured feishu alert address.
+    """
+
+    # nothing has been stored yet, so there is nothing to compare
+    if len(timer.names) == 0 or len(timer.times) == 0:
+        return
+
+    world_size = gpc.get_world_size(ParallelMode.DATA)
+    # NOTE(review): this early return also skips the historical comparison further down,
+    # which does not need other ranks -- confirm that is intended.
+    if world_size < 2:
+        return
+
+    # if gpc.is_rank_for_log():
+    #     logger.info("Diagnosis running timers ...")
+
+    # detect slow rank compared to other ranks in the same DP group
+    running_time = torch.Tensor(timer.times).to(device=get_current_device())
+    avg_time = running_time.detach().clone()
+    if world_size <= 4:
+        # small group: plain average over all ranks
+        dist.all_reduce(avg_time, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.DATA))
+    else:
+        # larger group: drop the per-timer max and min before averaging the remaining ranks
+        running_time_max = avg_time.detach().clone()
+        running_time_min = avg_time.detach().clone()
+        dist.all_reduce(running_time_max, op=torch.distributed.ReduceOp.MAX, group=gpc.get_group(ParallelMode.DATA))
+        dist.all_reduce(running_time_min, op=torch.distributed.ReduceOp.MIN, group=gpc.get_group(ParallelMode.DATA))
+        dist.all_reduce(avg_time, op=torch.distributed.ReduceOp.SUM, group=gpc.get_group(ParallelMode.DATA))
+        avg_time = (avg_time - running_time_max - running_time_min) / (world_size - 2)
+
+    # element-wise flag: this rank's time exceeds the group average by the configured ratio
+    diag_result = running_time > avg_time * gpc.config.data.diag_outlier_ratio
+    diag_result = diag_result.tolist()
+    avg_time = avg_time.tolist()
+
+    for slow, name, time, avg in zip(diag_result, timer.names, timer.times, avg_time):
+        # skip timers that are not flagged or whose group average is below 0.5
+        if slow is False or avg < 0.5:
+            continue
+        msg = (
+            f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} is slower than avg on {name}, "
+            f"Hostname {socket.gethostname()}, "
+            f"its time {time:.2f}, avg {avg:.2f}, "
+            f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
+        )
+        logger.warning(msg)
+        send_alert_message(
+            address=gpc.config.monitor.alert.feishu_alert_address,
+            message=msg,
+        )
+
+    # detect slow rank compared to historical timer data
+    for name, time in zip(timer.names, timer.times):
+        # require at least five stored samples before trusting the historical average
+        if name not in timer.hist or len(timer.hist[name]) < 5:
+            continue
+        hist_avg = sum(timer.hist[name]) / len(timer.hist[name])
+        if time > hist_avg * gpc.config.data.diag_outlier_ratio and time > 0.5:
+            msg = (
+                f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} is slower than hist avg on {name}, "
+                f"Hostname {socket.gethostname()}, "
+                f"its time {time:.2f}, hist_avg {hist_avg:.2f}, "
+                f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
+            )
+            logger.warning(msg)
+            send_alert_message(
+                address=gpc.config.monitor.alert.feishu_alert_address,
+                message=msg,
+            )
+
+
+def bench_net():
+    """Benchmark nccl performance for slow node detection.
+
+    Times an all-reduce over the NETTEST group, averages the measured time across the
+    GLOBAL group, and raises an alert when this rank's time reaches the average multiplied
+    by ``gpc.config.data.diag_outlier_ratio``.
+    """
+
+    # a single process has no peer to communicate with
+    if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
+        return
+
+    # if gpc.is_rank_for_log():
+    #     logger.info("benchmarking network speed ...")
+
+    repeats = 100
+    # payload of 8 * 1024 * 1024 bfloat16 elements
+    input_data = torch.randn(
+        8 * 1024 * 1024,
+        device=get_current_device(),
+        dtype=torch.bfloat16,
+    )
+
+    def allreduce_fn(inputs):
+        dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
+
+    bench_timer = benchmark.Timer(
+        stmt="test_fn_amp(inputs)",
+        globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
+        num_threads=torch.get_num_threads(),
+    )
+    allreduce_time = bench_timer.timeit(repeats).mean
+    # presumably converts seconds to milliseconds -- TODO confirm the unit of Measurement.mean
+    allreduce_time = allreduce_time * 10**3
+    # keep the local value; the tensor below is overwritten by the collective
+    allreduce_time_this = allreduce_time
+    allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
+    # reduce with the default op, then divide by the world size to obtain the average
+    dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
+    allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
+    allreduce_time_avg = float(allreduce_time_avg.item())
+
+    if allreduce_time_this >= allreduce_time_avg * gpc.config.data.diag_outlier_ratio:
+        msg = (
+            f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
+            f"Hostname {socket.gethostname()}, "
+            f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
+            f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
+        )
+        logger.warning(msg)
+        send_alert_message(
+            address=gpc.config.monitor.alert.feishu_alert_address,
+            message=msg,
+        )
+
+
+def bench_gpu(use_flash_attn=True):
+    """Benchmark single GPU performance for slow node detection.
+
+    Times an attention forward pass on a random input, converts the time to a speed with
+    ``flops``, averages the speed across the GLOBAL group, and raises an alert when this
+    rank's speed is at or below the average divided by ``gpc.config.data.diag_outlier_ratio``.
+
+    Args:
+        use_flash_attn (bool): benchmark ``FlashSelfAttention`` when True, otherwise
+            ``SelfAttention``.
+    """
+
+    # if gpc.is_rank_for_log():
+    #     logger.info("benchmarking gpu speed ...")
+
+    # fixed benchmark problem size
+    headdim = 64
+    dim = 2048
+    batch_size, seqlen = 2, 1024
+    nheads = dim // headdim
+
+    inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
+    inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
+
+    # random input of shape (batch_size, seqlen, 3, nheads, headdim)
+    qkv = torch.randn(
+        batch_size,
+        seqlen,
+        3,
+        dim // headdim,
+        headdim,
+        device=get_current_device(),
+        dtype=torch.float16,
+        requires_grad=True,
+    )
+    time_f = benchmark_forward(inner_attn, qkv)
+    speed = flops(batch_size, seqlen, headdim, nheads, time_f)
+    # keep the local value; the tensor below is overwritten by the collective
+    speed_this = speed
+    speed = torch.Tensor([speed]).to(device=get_current_device())
+    # reduce with the default op, then divide by the world size to obtain the average
+    dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
+    speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
+    speed_avg = float(speed_avg.item())
+
+    if speed_this <= speed_avg / gpc.config.data.diag_outlier_ratio:
+        msg = (
+            f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
+            f"Hostname {socket.gethostname()}, "
+            f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
+            f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
+        )
+        logger.warning(msg)
+        send_alert_message(
+            address=gpc.config.monitor.alert.feishu_alert_address,
+            message=msg,
+        )
--- a/internlm/utils/logger.py
+++ b/internlm/utils/logger.py
@ -84,7 +84,7 @@ def initialize_uniscale_logger(
            job_name and launch_time and file_name
        ), "If file_path is None, job_name, launch_time and file_name must be setted."
        log_file_name = file_name
-        log_folder = os.path.join(job_name, launch_time, "logs")
+        log_folder = os.path.join("RUN", job_name, launch_time, "logs")
        log_dir = os.path.join(log_folder, log_file_name)
        file_path = log_dir

--- a/internlm/utils/megatron_timers.py
+++ b/internlm/utils/megatron_timers.py
@ -16,8 +16,12 @@ class _Timer:
        self.start_time = time.time()
        self.stream = torch.cuda.current_stream()

-    def start(self):
+    def start(self, reset_all=True):
        """Start the timer."""
+        # need to reset all timers in a new batch
+        if self.name_ == "one-batch" and reset_all is True:
+            megatron_timer.reset()
+
        assert not self.started_, "timer has already been started"
        self.stream.synchronize()
        self.start_time = time.time()
@ -48,7 +52,7 @@ class _Timer:
            self.reset()
        # If timing was in progress, set it back.
        if started_:
-            self.start()
+            self.start(reset_all=False)
        return elapsed_


@ -57,12 +61,29 @@ class Timers:

    def __init__(self):
        self.timers = {}
+        self.hist = {}
+        self.names = []
+        self.times = []

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = _Timer(name)
        return self.timers[name]

+    def store_last_timers(self):
+        """Snapshot every timer's elapsed time and update the per-timer history.
+
+        ``self.names`` and ``self.times`` are rebuilt as parallel lists, one entry per
+        registered timer. Each value is also appended to ``self.hist[name]``, which keeps
+        at most the 10 most recent entries.
+        """
+        self.names, self.times = [], []
+        for timer_name, timer_obj in self.timers.items():
+            # read without resetting so the timer itself is left untouched
+            elapsed_seconds = round(float(timer_obj.elapsed(reset=False)), 4)
+            self.names.append(timer_name)
+            self.times.append(elapsed_seconds)
+
+            history = self.hist.setdefault(timer_name, [])
+            history.append(elapsed_seconds)
+            if len(history) > 10:
+                history.pop(0)
+
    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
        """Write timers to a tensorboard writer"""
        # currently when using add_scalars,
--- a/internlm/utils/model_checkpoint.py
+++ b/internlm/utils/model_checkpoint.py
@ -2,41 +2,139 @@
 # -*- encoding: utf-8 -*-

 import copy
-import fcntl
+import inspect
 import os
 import re
 import socket
 import time
 from collections import defaultdict
 from enum import Enum
-from typing import Dict
+from typing import Callable, Dict, Union

 import torch

 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.core.trainer import TrainState
+from internlm.initialize.launch import get_config_value
+from internlm.initialize.legacy.launch import (
+    auto_resume_sanity_check,
+    ckpt_info_sanity_check,
+)
 from internlm.model.moe import MoE
 from internlm.monitor import send_alert_message
-from internlm.solver.optimizer import HybridZeroOptimizer
+from internlm.solver.optimizer import HybridZeroOptimizer, reload_zero_fp32_buff
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.storage_manager import (
    get_fns,
    get_storage_manager,
+    init_storage_manager,
    llm_load,
    llm_save,
+    try_get_storage_backend,
 )
+from internlm.utils.timeout import llm_timeout

 logger = get_logger(__file__)


-class CheckpointType(Enum):
+class CheckpointSaveType(Enum):
    NORMAL_CHECKPOINT = 1
    SNAPSHOT_CHECKPOINT = 2


+class CheckpointLoadType(Enum):
+    """Checkpoint load types that are represented by an enumeration member."""
+
+    INTERNLM = "internlm"
+
+
+# The load method implemented by internlm by default does not use string representation types,
+# but uses enumeration types defined in advance.
+LOAD_TYPE_DICT = {
+    "internlm": CheckpointLoadType.INTERNLM,
+}
+
+
+class CheckpointLoadContent:
+    """String constants naming the checkpoint components that can be loaded."""
+
+    MODEL = "model"
+    SAMPLER = "sampler"
+    # NOTE(review): "OPIMIZER" and "SCHEDULAER" are misspelled, but other code in this file
+    # refers to them under these names, so a rename needs a coordinated change.
+    OPIMIZER = "optimizer"
+    SCHEDULAER = "scheduler"
+
+
+class CheckpointLoadMethod:
+    """Registry that maps a checkpoint load type to the function performing the load.
+
+    Besides the built-in internlm method, users can register their own custom
+    checkpoint loading functions.
+    """
+
+    LOAD_FUNC_SIG = None
+    LOAD_TYPE_FUNC = {}
+
+    @staticmethod
+    def convet_load_type(load_type: str) -> Union[CheckpointLoadType, str]:
+        """Return the enum member for a built-in load type, else the given string unchanged."""
+        # User-defined types are not converted and stay represented as strings.
+        return LOAD_TYPE_DICT.get(load_type.lower(), load_type)
+
+    @staticmethod
+    def register_ckpt_load_type(load_type: Union[str, CheckpointLoadType], load_func: Callable):
+        """Register ``load_func`` for ``load_type``; an already registered type is left as is."""
+        registry = CheckpointLoadMethod.LOAD_TYPE_FUNC
+        if load_type in registry:
+            logger.warning(f"{load_type} has aleady been registed!")
+            return
+
+        registry[load_type] = load_func
+        signature = inspect.signature(load_func)
+
+        if load_type == CheckpointLoadType.INTERNLM:
+            # the built-in loader defines the reference signature
+            CheckpointLoadMethod.LOAD_FUNC_SIG = signature
+        elif signature != CheckpointLoadMethod.LOAD_FUNC_SIG:
+            logger.warning(
+                f"registe load model ckpt signature is not same with: {CheckpointLoadMethod.LOAD_FUNC_SIG}"
+            )
+
+    @staticmethod
+    def get_ckpt_load_type_func(load_type: Union[str, CheckpointLoadType]):
+        """Return the load function registered for ``load_type`` (KeyError when absent)."""
+        return CheckpointLoadMethod.LOAD_TYPE_FUNC[load_type]
+
+
+class CheckpointLoadMask:
+    """
+    According to the content field in the incoming ckpt_info, decide which components to load.
+    """
+
+    LOAD_CONTENT_DICT = {
+        "model": CheckpointLoadContent.MODEL,
+        "sampler": CheckpointLoadContent.SAMPLER,
+        "optimizer": CheckpointLoadContent.OPIMIZER,
+        "scheduler": CheckpointLoadContent.SCHEDULAER,
+    }
+
+    def __init__(self, content: tuple) -> None:
+        """Build the set of components to load.
+
+        Args:
+            content: component names (case-insensitive) or a single name; "all" selects
+                every known component.
+
+        Raises:
+            KeyError: if a name is neither "all" nor a key of ``LOAD_CONTENT_DICT``.
+        """
+        # A bare string would be iterated character by character and fail with a
+        # confusing KeyError, so treat it as a one-element collection.
+        if isinstance(content, str):
+            content = (content,)
+
+        # Materialize once: iterating ``content`` a second time would yield nothing
+        # for a one-shot iterable and silently produce an empty load set.
+        requested = {name.lower() for name in content}
+        if "all" in requested:
+            self.load_set = set(CheckpointLoadMask.LOAD_CONTENT_DICT.values())
+        else:
+            self.load_set = {CheckpointLoadMask.LOAD_CONTENT_DICT[name] for name in requested}
+
+    def need_load(self, content: CheckpointLoadContent):
+        """Return True if ``content`` is among the components to load."""
+        return content in self.load_set
+
+    def not_only_load(self, content: CheckpointLoadContent):
+        """Return True if ``content`` is to be loaded together with at least one other component."""
+        return content in self.load_set and len(self.load_set) > 1
+
+    def only_load(self, content: CheckpointLoadContent):
+        """Return True if ``content`` is the sole component to load."""
+        return {content} == self.load_set
+
+    def __str__(self) -> str:
+        return f"{self.load_set}."
+
+    def __repr__(self) -> str:
+        return f"{self.load_set}."
+
+
 def get_model_topology(model):
    """
    Returns:
@ -58,6 +156,66 @@ def get_model_topology(model):
    return topos


+def try_load_internlm_ckpt(ckpt_mm, load_info, train_state: TrainState):
+    """Load the components selected in ``load_info`` from an internlm checkpoint folder.
+
+    Args:
+        ckpt_mm: object providing ``model``, ``optimizer``, ``lr_scheduler`` and ``train_dl``
+            (presumably the CheckpointManager -- confirm against the caller).
+        load_info: mapping with "path" (checkpoint folder) and "content"
+            (a ``CheckpointLoadMask``).
+        train_state (TrainState): training state that is restored in place.
+
+    Returns:
+        str: comma-separated names of the components that were loaded.
+    """
+    load_content_str = ""
+    load_ckpt_folder = load_info["path"]
+    load_content: CheckpointLoadMask = load_info["content"]
+
+    if gpc.is_rank_for_log():
+        logger.info(f"Try load_ckpt_folder: {load_ckpt_folder}")
+
+    if load_content.need_load(CheckpointLoadContent.MODEL):
+        load_model_checkpoint(folder=load_ckpt_folder, model=ckpt_mm.model)
+        load_content_str += f"{CheckpointLoadContent.MODEL}, "
+
+    # Everything below runs only when the model is requested together with at least one
+    # other component; a request that does not include the model skips this branch.
+    if load_content.not_only_load(CheckpointLoadContent.MODEL):
+        # load training states.
+        load_context(load_ckpt_folder, train_state)
+
+        # load optimizer states.
+        if load_content.need_load(CheckpointLoadContent.OPIMIZER):
+            load_optimizer_checkpoint(load_ckpt_folder, ckpt_mm.optimizer)
+            load_content_str += f"{CheckpointLoadContent.OPIMIZER}, "
+        else:
+            # NOTE(review): this branch is taken when the optimizer was not requested, while
+            # the message talks about a missing optimizer object -- confirm the wording.
+            if gpc.is_rank_for_log():
+                logger.warning("CheckpointManager has no 'optimizer', skip reload optim checkpoint!")
+
+        # load lr scheduler states.
+        if load_content.need_load(CheckpointLoadContent.SCHEDULAER):
+            if ckpt_mm.lr_scheduler:
+                load_scheduler(load_ckpt_folder, ckpt_mm.lr_scheduler, ckpt_mm.optimizer, train_state)
+                load_content_str += f"{CheckpointLoadContent.SCHEDULAER}, "
+            else:
+                if gpc.is_rank_for_log():
+                    logger.warning("CheckpointManager has no 'lr_scheduler', skip reload lr_scheduler checkpoint!")
+
+        # load dataloader sampler states.
+        if load_content.need_load(CheckpointLoadContent.SAMPLER):
+            # only reload when train_state tracks a sampler that is not a torch BatchSampler
+            if hasattr(train_state, "batch_sampler") and not isinstance(
+                train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
+            ):
+                load_sampler(load_ckpt_folder, ckpt_mm.train_dl.batch_sampler)
+                # track the actual updates of sampler when using weighted sampling
+                train_state.init_batch_sampler(ckpt_mm.train_dl.batch_sampler)
+                load_content_str += f"{CheckpointLoadContent.SAMPLER}, "
+            else:
+                if gpc.is_rank_for_log():
+                    logger.warning("CheckpointManager skip reload 'batch_sampler'")
+
+            # reload data state dict.
+            # NOTE(review): when both this branch and the sampler branch above run, the sampler
+            # name is appended to load_content_str twice -- confirm whether that is intended.
+            if hasattr(train_state, "data_state_dict"):
+                ckpt_mm.train_dl.dataset.load_state_dict(
+                    llm_load(os.path.join(load_ckpt_folder, "sampler_0.pt")), ckpt_path=load_ckpt_folder
+                )
+                load_content_str += f"{CheckpointLoadContent.SAMPLER}, "
+            else:
+                if gpc.is_rank_for_log():
+                    logger.warning(
+                        "CheckpointManager has no 'data_state_dict', skip reload data_state_dict checkpoint!"
+                    )
+    return load_content_str
+
+
 def save_model_checkpoint(folder, model):
    """
    Save the model according to the relationship between tp and dp. The principle is that the data of each tp
@ -327,15 +485,16 @@ def load_sampler(ckpt_path: str, sampler):
    torch.cuda.empty_cache()


-def load_context(ckpt_path: str, train_dl, train_state: TrainState):
+def load_context(ckpt_path: str, train_state: TrainState):
    context_stuffs = llm_load(os.path.join(ckpt_path, "context.pt"))
-    train_state.load_state_dict(context_stuffs, train_dl)
+    train_state.load_state_dict(context_stuffs)
    if gpc.is_rank_for_log():
        logger.info(f"reload train_state:{train_state}")
    torch.cuda.empty_cache()


-def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train_state: TrainState):
+def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, train_state: TrainState):
+    learning_rate = train_state.lr
    scheduler_states = llm_load(os.path.join(ckpt_path, "schedulder.pt"))
    if learning_rate != scheduler_states["base_lrs"][0] and gpc.is_rank_for_log():
        logger.warning(
@ -364,7 +523,17 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train
 class CheckpointManager:
    """StorageManagerContext"""

-    def __init__(self, ckpt_config, model, model_config=None, model_config_file=None, feishu_address=None) -> None:
+    def __init__(
+        self,
+        ckpt_config,
+        model,
+        train_dl=None,
+        optimizer=None,
+        lr_scheduler=None,
+        model_config=None,
+        model_config_file=None,
+        feishu_address=None,
+    ) -> None:
        """
        CheckpointManager is used to decide when to store ckpt. If it is an asynchronous
        upload mode, you must call wait_async_upload_finish at the end of the program to wait
@ -377,22 +546,44 @@ class CheckpointManager:
            lr_scheduler (object): lr_scheduler obj.
            model_config (dict): model config.
        """
-        self.enable_save_ckpt = ckpt_config.enable_save_ckpt
-        self.checkpoint_every = ckpt_config.checkpoint_every
-        self.save_ckpt_folder = ckpt_config.save_ckpt_folder
-        self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
-        self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
-        self.stop_file_path = ckpt_config.stop_file_path
-        self.load_model_only_folder = ckpt_config.load_model_only_folder
+        self.enable_save_ckpt = get_config_value(ckpt_config, "enable_save_ckpt", False)
+        self.checkpoint_every = get_config_value(ckpt_config, "checkpoint_every", 100)
+        self.save_ckpt_folder = get_config_value(ckpt_config, "save_ckpt_folder", None)
+        self.oss_snapshot_freq: int = get_config_value(ckpt_config, "oss_snapshot_freq", 50)
+        self.stop_file_path = get_config_value(ckpt_config, "stop_file_path", None)
+        if self.save_ckpt_folder:
+            self.snapshot_ckpt_folder = get_config_value(
+                ckpt_config, "snapshot_ckpt_folder", os.path.join(self.save_ckpt_folder, "snapshot")
+            )
+            self.async_upload_tmp_folder = get_config_value(
+                ckpt_config, "async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/"
+            )
+        else:
+            self.snapshot_ckpt_folder = None
+            self.async_upload_tmp_folder = None
+
+        self.async_upload = get_config_value(ckpt_config, "async_upload", False)
+
+        # initialization storage manager
+        init_storage_manager(self.enable_save_ckpt, self.async_upload_tmp_folder, self.async_upload)
+
        self.feishu_address = feishu_address
        self.storage_manager = get_storage_manager()
        self.snapshot_counter = 0
-        self.load_optimizer = gpc.config.ckpt.load_optimizer

        self.model = model
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+        self.train_dl = train_dl
        self.model_config = model_config
        self.model_config_file = model_config_file

+        # Register defalut internlm ckpt load type.
+        self.defalut_load_type_func = {CheckpointLoadType.INTERNLM: try_load_internlm_ckpt}
+        for ckpt_load_type in CheckpointLoadType:
+            CheckpointLoadMethod.register_ckpt_load_type(ckpt_load_type, self.defalut_load_type_func[ckpt_load_type])
+
+        # Init alter file.
        if self.stop_file_path and gpc.get_global_rank() == 0:
            dir_path = os.path.dirname(self.stop_file_path)
            if dir_path != "" and not os.path.exists(dir_path):
@ -400,21 +591,35 @@ class CheckpointManager:
            with open(self.stop_file_path, "w", encoding="utf-8") as f:
                f.write("0")

-        if ckpt_config.load_given_ckpt is False:
-            # Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
-            latest_ckpt_path = self.query_lastest_ckpt()
-            if latest_ckpt_path:
-                self.load_ckpt_folder = latest_ckpt_path
-            else:
-                # At this time, we have to load model init weights and train from step 0.
-                self.load_ckpt_folder = self.load_model_only_folder
-        else:
-            self.load_ckpt_folder = ckpt_config.load_ckpt_folder
+        self.load_ckpt_info = get_config_value(ckpt_config, "load_ckpt_info", None)
+        if self.load_ckpt_info is None:  # (legacy): Try Compatible with old interfaces
+            self.load_ckpt_info = ckpt_info_sanity_check(ckpt_config)

-        if gpc.is_rank_for_log():
-            logger.info(f"load_ckpt_folder will set to :'{self.load_ckpt_folder}'")
-            if self.stop_file_path is None:
-                logger.warning("no set stop_file_path, quit_signal_handler is disable")
+        # Auto-reload latest checkpoint, it will overwrite the setting of 'load_ckpt_info'.
+        self.auto_resume = get_config_value(ckpt_config, "auto_resume", None)
+        if self.auto_resume is None:  # (legacy): Try Compatible with old interfaces
+            self.auto_resume = auto_resume_sanity_check(ckpt_config)
+        if self.auto_resume:
+            self.load_ckpt_info = self.query_lastest_ckpt()
+
+        if self.stop_file_path is None and gpc.is_rank_for_log():
+            logger.warning("no set stop_file_path, quit_signal_handler is disable")
+
+        # convert to internal representation
+        if self.load_ckpt_info:
+            assert (
+                "path" in self.load_ckpt_info
+                and "content" in self.load_ckpt_info
+                and "ckpt_type" in self.load_ckpt_info
+            ), "please set content in ckpt setting, eg: ckpt = dict(path='', content=['model'], ckpt_type='internlm')"
+
+            # replace load_ckpt
+            self.load_ckpt_info["content"] = CheckpointLoadMask(self.load_ckpt_info["content"])
+            self.load_ckpt_info["ckpt_type"] = CheckpointLoadMethod.convet_load_type(self.load_ckpt_info["ckpt_type"])
+
+        # test storage setting is ok.
+        if self.enable_save_ckpt:
+            self.try_ping_storage()

    def quit_signal_handler(self, train_state) -> bool:
        """
@ -428,17 +633,22 @@ class CheckpointManager:
        Returns:
            bool: whether to quit.
        """
-        now_break, now_save_ckpt, save_type = False, False, CheckpointType.NORMAL_CHECKPOINT
+        now_break, now_save_ckpt, save_type = False, False, CheckpointSaveType.NORMAL_CHECKPOINT

        if self.stop_file_path is None:
            return now_break, now_save_ckpt, save_type

-        with open(self.stop_file_path, "a+", encoding="utf-8") as f:
-            fcntl.flock(f, fcntl.LOCK_EX)
-            f.seek(0)
-            msg = f.read()
-            fcntl.flock(f, fcntl.LOCK_UN)
-            action_step = int(msg)
+        with torch.no_grad():
+            action_step_t = torch.zeros((1,), dtype=torch.int64).cuda()
+            if gpc.get_global_rank() == 0:
+                with open(self.stop_file_path, "r+", encoding="utf-8") as f:
+                    f.seek(0)
+                    msg = f.read()
+                    action_step_t.fill_(int(msg))
+
+            torch.distributed.broadcast(action_step_t, src=0)
+            action_step = action_step_t.item()
+            del action_step_t

        if action_step < 0 and abs(action_step) == train_state.step_count:
            now_save_ckpt = True
@ -459,24 +669,29 @@ now step_count is {train_state.step_count}",

        return now_break, now_save_ckpt, save_type

-    def try_save_checkpoint(self, train_state):
-        if not self.enable_save_ckpt:
-            return False
-
-        save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
+    def is_now_to_save_ckpt(self, train_state) -> (bool, CheckpointSaveType, bool):
+        save_ckpts, save_type, now_break = False, CheckpointSaveType.NORMAL_CHECKPOINT, False
        if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
-            save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
+            save_ckpts, save_type = True, CheckpointSaveType.SNAPSHOT_CHECKPOINT
        if train_state.step_count % self.checkpoint_every == 0:
-            save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
+            save_ckpts, save_type = True, CheckpointSaveType.NORMAL_CHECKPOINT
        now_break, singal_save_ckpts, singal_save_type = self.quit_signal_handler(train_state)
        if save_ckpts is False:
            save_ckpts = singal_save_ckpts
            save_type = singal_save_type

+        return save_ckpts, save_type, now_break
+
+    def try_save_checkpoint(self, train_state):
+        if not self.enable_save_ckpt:
+            return False
+
+        save_ckpts, save_type, now_break = self.is_now_to_save_ckpt(train_state)
+
        if save_ckpts:
            # Wait for the previous round of asynchronous upload storage to complete.
            self.storage_manager.wait()
-            if save_type == CheckpointType.SNAPSHOT_CHECKPOINT:
+            if save_type == CheckpointSaveType.SNAPSHOT_CHECKPOINT:
                # Snapshot number, with only two snapshots written alternately.
                self.snapshot_counter = (self.snapshot_counter + 1) % 2
                save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
@ -506,51 +721,63 @@ now step_count is {train_state.step_count}",
            Tuple(str, int): path of latest ckpt and ckpt step, if not found, None will return.
        """
        ckpt_list = self.storage_manager.get_fns(self.save_ckpt_folder)
-        if len(ckpt_list) == 0:
+        if ckpt_list is None or len(ckpt_list) == 0:
            return None, None

        max_normal_step = 0
-        ckpt_list = list(map(lambda a: int(a.strip("/")) if a.strip("/").isdigit() else 0, ckpt_list))
-        ckpt_list.sort(reverse=True)
-        for ckpt in ckpt_list:
-            fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
-            for fn in fns_list:
-                if fn.endswith(".step"):
-                    max_normal_step = ckpt
+        # Return ckpt_list look like: ['pings', 'snapshot', '4']
+        # Here we only try to find the ckpt folder named after step, ignoring snapshot and other folders.
+        ckpt_list = [int(fn.strip("/")) for fn in ckpt_list if fn.strip("/").isdigit()]
+        if len(ckpt_list) == 0:
+            logger.warning("Not found avaliable normal checkpoint!")
+        else:
+            logger.info(f"Found avaliable normal checkpoint: {ckpt_list}!")
+            ckpt_list.sort(reverse=True)
+            for ckpt in ckpt_list:
+                fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
+                for fn in fns_list:
+                    if fn.endswith(".step"):
+                        max_normal_step = ckpt
+                        break
+                if max_normal_step != 0:
                    break
-            if max_normal_step != 0:
-                break

-        max_normal_step = ckpt_list[0]
-        load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))
+            max_normal_step = ckpt_list[0]
+            load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))

        snapshot_path_0 = os.path.join(self.save_ckpt_folder, "snapshot", "0")
        snapshot_path_1 = os.path.join(self.save_ckpt_folder, "snapshot", "1")
-        ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_0)
-        ckpt_list_2 = self.storage_manager.get_fns(snapshot_path_1)
-        max_step_0, max_step_1 = 0, 0
-        for ckpt in ckpt_list_1:
-            ckpt = ckpt.strip("/")
-            if ckpt.endswith(".step"):
-                max_step_0 = max(max_step_0, int(ckpt.split(".")[0]))
-        for ckpt in ckpt_list_2:
-            ckpt = ckpt.strip("/")
-            if ckpt.endswith(".step"):
-                max_step_1 = max(max_step_1, int(ckpt.split(".")[0]))
+        ckpt_list_0 = self.storage_manager.get_fns(snapshot_path_0)
+        ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_1)

-        snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
-        snap_step = max(max_step_0, max_step_1)
-        load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
-        load_step = max(snap_step, max_normal_step)
-        return load_path, load_step
+        def found_latest_snapshot(_ckpt_list):
+            _max_step_snapshot = 0
+            if _ckpt_list:
+                for ckpt in _ckpt_list:
+                    ckpt = ckpt.strip("/")
+                    if ckpt.endswith(".step"):
+                        _max_step_snapshot = max(_max_step_snapshot, int(ckpt.split(".")[0]))
+            return _max_step_snapshot
+
+        max_step_0 = found_latest_snapshot(ckpt_list_0)
+        max_step_1 = found_latest_snapshot(ckpt_list_1)
+
+        if sum([max_step_0, max_step_1, max_normal_step]) == 0:
+            return None, None
+        else:
+            snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
+            snap_step = max(max_step_0, max_step_1)
+            load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
+            return load_path, max(snap_step, max_normal_step)

    def query_latest_snapshot_step_local(self):
        max_step, max_step_path = 0, None
-        for root, _, files in os.walk(self.save_ckpt_folder, followlinks=True):
+        save_ckpt_folder = self.save_ckpt_folder.split(":")[1]
+        for root, _, files in os.walk(save_ckpt_folder, followlinks=True):
            for fn in files:
                fn = fn.strip("/")
                if fn.endswith(".step"):
-                    # We assume that both normal ckpt and snapshot ckpt will store the '.step' file
+                    # We assume that both internlm ckpt and snapshot ckpt will store the '.step' file
                    # as an integrity flag.
                    step = int(fn.rsplit(".", maxsplit=1)[0])
                    if max_step < step:
@ -560,100 +787,55 @@ now step_count is {train_state.step_count}",
        return max_step_path, max_step

    def query_lastest_ckpt(self):
-        latest_checkpoint = None
+        latest_ckpt, step = None, -1
        # Training was automatically restarted by the process, forcing the latest snapshot to be read.
        if self.save_ckpt_folder:
-            if self.save_ckpt_folder.startswith("boto3"):
-                latest_checkpoint, step = self.query_latest_snapshot_step_boto3()
-            elif self.save_ckpt_folder.startswith("local"):
-                latest_checkpoint, step = self.query_latest_snapshot_step_local()
-            else:
-                latest_checkpoint, step = None, 0
+            backend, _ = try_get_storage_backend(self.save_ckpt_folder)
+            if backend == "boto3":
+                latest_ckpt, step = self.query_latest_snapshot_step_boto3()
+                if latest_ckpt and not latest_ckpt.startswith("boto3:"):
+                    latest_ckpt = ":".join(["boto3", latest_ckpt])
+            elif backend == "local":
+                latest_ckpt, step = self.query_latest_snapshot_step_local()
+                if latest_ckpt and not latest_ckpt.startswith("local:"):
+                    latest_ckpt = ":".join(["local", latest_ckpt])

-            if latest_checkpoint is not None:
-                if gpc.is_rank_for_log():
-                    logger.info(f"Found latest ckpt : {latest_checkpoint}, step: {step}")
-                    send_alert_message(
-                        address=self.feishu_address,
-                        message=f"Auto restart resume from ckpt-path: '{latest_checkpoint}', step : {step}",
-                    )
-            else:
-                if gpc.is_rank_for_log():
-                    send_alert_message(
-                        address=self.feishu_address,
-                        message=f"Can't find snapshot checkpoint, use default load-ckpt path: {latest_checkpoint}",
-                    )
+        if gpc.is_rank_for_log():
+            logger.info(f"Found latest ckpt {latest_ckpt if latest_ckpt else 'None'}, step: {step}...")

-        return latest_checkpoint
+        return dict(path=latest_ckpt, content=("all",), ckpt_type="internlm")

-    def try_load_model(self, current_time=""):
-        model_load_path = None
+    def try_resume_training(self, train_state: TrainState, current_time=""):

-        if self.load_ckpt_folder and self.load_model_only_folder:
-            raise ValueError(
-                "Error, try to use both load_ckpt_folder and load_model_only_folder paths, \
-if you only need to load model weights (for example starting an SFT task for the first time), \
-set load_model_only_folder path, if you need to resume training from ckpt, \
-set load_ckpt_folder or use default value \
-(if is the default value, internlm will try to load the latest ckpt from save_ckpt_folder)"
-            )
-
-        if self.load_ckpt_folder:
-            if gpc.is_rank_for_log():
-                logger.info(
-                    f"===========Resume training from `{self.load_ckpt_folder}` {current_time} on host:"
-                    f"{socket.gethostname()}==========="
-                )
-            model_load_path = self.load_ckpt_folder
-        elif self.load_model_only_folder:
-            if gpc.is_rank_for_log():
-                logger.info(
-                    f"===========Load Model from `{self.load_model_only_folder}` {current_time} on host:"
-                    f"{socket.gethostname()}==========="
-                )
-            model_load_path = self.load_model_only_folder
-        else:
+        if self.load_ckpt_info is None or self.load_ckpt_info["path"] is None:
            if gpc.is_rank_for_log():
                logger.info(
                    f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
                    f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
                    f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
                )
+        else:
+            load_path = self.load_ckpt_info["path"]
+            load_content = self.load_ckpt_info["content"]
+            load_type = self.load_ckpt_info["ckpt_type"]

-        # Loading model weights must be done before zero is initialized.
-        if model_load_path is not None:
-            load_model_checkpoint(folder=model_load_path, model=self.model)
+            load_func = CheckpointLoadMethod.get_ckpt_load_type_func(load_type)
+            load_content_str = load_func(self, self.load_ckpt_info, train_state)

-    def try_resume_training(self, lr_scheduler, optimizer, lr, train_state, train_dl):
-        """Attempt to restore the training state of the last ckpt.
+            # If we only load model weight, we need rewrite zero optim's fp32 buffer.
+            if load_content.only_load(CheckpointLoadContent.MODEL) and isinstance(self.optimizer, HybridZeroOptimizer):
+                reload_zero_fp32_buff(self.optimizer)

-        Args:
-            lr_scheduler (_LRScheduler): lr_scheduler object.
-            optimizer (Optimizer): optimizer object.
-            lr (float): learning rate.
-            train_state (dict): traing states.
-            train_dl (DataLoader): traning dataloader object
-        """
-        if self.load_ckpt_folder is not None:
-            # load optimzier states.
-            if self.load_optimizer:
-                load_optimizer_checkpoint(self.load_ckpt_folder, optimizer)
-            # load lr scheduler states.
-            load_scheduler(self.load_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
-            # load training states.
-            load_context(self.load_ckpt_folder, train_dl, train_state)
-            # load dataloader sampler states.
-            if hasattr(train_state, "batch_sampler") and not isinstance(
-                train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
-            ):
-                load_sampler(self.load_ckpt_folder, train_dl.batch_sampler)
-            if hasattr(train_state, "data_state_dict"):
-                train_dl.dataset.load_state_dict(
-                    llm_load(os.path.join(self.load_ckpt_folder, "sampler_0.pt")), ckpt_path=self.load_ckpt_folder
+            if gpc.is_rank_for_log():
+                logger.info(f"load_ckpt_info : {self.load_ckpt_info}")
+                logger.info(
+                    f"===========Resume training from `{load_path}` {current_time} on host:"
+                    f"{socket.gethostname()}==========="
                )
-        self.optimizer = optimizer
-        self.lr_scheduler = lr_scheduler
+                if load_content_str:
+                    logger.info(f"===========Load contents are: {load_content_str}")

+    @llm_timeout(func_name="save_checkpoint")
    def save_checkpoint(
        self,
        folder,
@ -694,8 +876,10 @@ set load_ckpt_folder or use default value \
            )

        if gpc.is_rank_for_log():
-            scheduler_states = scheduler.state_dict()
-            llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
+            if scheduler:
+                scheduler_states = scheduler.state_dict()
+                llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
+
            if hasattr(train_state, "batch_sampler") and not isinstance(
                train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
            ):
@ -725,3 +909,12 @@ set load_ckpt_folder or use default value \
    def set_save_folder(self, folder, step):
        self.storage_manager.latest_save_folder = folder
        self.storage_manager.latest_save_step = step
+
+    def try_ping_storage(self):
+        """Save and read back a small probe tensor to exercise the checkpoint storage.
+
+        Only ranks whose global rank is a multiple of 8 run the probe; all other
+        ranks return immediately. The probe file is written to
+        ``<save_ckpt_folder>/pings/<hostname>.ping``.
+        """
+        if gpc.get_global_rank() % 8 != 0:
+            return
+
+        probe = torch.ones((1, 64, 64), dtype=torch.bfloat16)
+        ping_path = os.path.join(self.save_ckpt_folder, f"pings/{socket.gethostname()}.ping")
+        manager = self.storage_manager
+        manager.save(ping_path, probe)
+        # Wait on the storage manager before reading the probe file back.
+        manager.wait()
+        manager.load(ping_path)
+        del probe
--- a/internlm/utils/storage_manager.py
+++ b/internlm/utils/storage_manager.py
@ -46,12 +46,12 @@ def get_fns(fp: str):
    return storage_manager.get_fns(fp)


-def llm_load(fp: str, *args, **kwargs):
-    return storage_manager.load(fp, *args, **kwargs)
+def llm_load(fp: str, **kwargs):
+    """Load the object stored at ``fp`` through the module-level storage manager.
+
+    Args:
+        fp (str): path of the object to load.
+        **kwargs: forwarded unchanged to ``storage_manager.load``.
+
+    Returns:
+        Whatever ``storage_manager.load`` returns for ``fp``.
+    """
+    loaded = storage_manager.load(fp, **kwargs)
+    return loaded


-def llm_save(save_path: str, saved_obj: Any, *args, **kwargs):
-    storage_manager.save(save_path, *args, saved_obj=saved_obj, **kwargs)
+def llm_save(save_path: str, saved_obj: Any, **kwargs):
+    """Save ``saved_obj`` to ``save_path`` through the module-level storage manager.
+
+    Args:
+        save_path (str): destination path.
+        saved_obj (Any): object to store; passed on as ``to_save_obj``.
+        **kwargs: forwarded unchanged to ``storage_manager.save``.
+    """
+    manager = storage_manager
+    manager.save(save_path, to_save_obj=saved_obj, **kwargs)


 class StorageClient:
@ -63,19 +63,23 @@ class StorageClient:
        self.handler = handler

    @staticmethod
-    def load(client, load_path: str, *args, **kwargs):
+    def load(*args, **kwargs):
        raise NotImplementedError

    @staticmethod
-    def sync_upload_fileobj(*args, saved_obj=None, **kwargs):
+    def sync_upload_fileobj(*args, **kwargs):
        raise NotImplementedError

    @staticmethod
-    def assert_fp_exists(client):
+    def async_upload_fileobj(*args, **kwargs):
        raise NotImplementedError

    @staticmethod
-    def get_fns(client):
+    def assert_fp_exists(*args, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def get_fns(*args, **kwargs):
        raise NotImplementedError


@ -92,40 +96,65 @@ class Boto3MetaInfo:
        async_upload_fn: callable,
        local_nvme_path=None,
    ) -> None:
-        self.is_async = is_async
+        # all need info.
        self.client = handler
        self.bucket_name = bucket_name
-        self.endpoint = endpoint
        self.file_path = file_path
-        self.async_upload_fn = async_upload_fn
+        # only save need info.
        self.local_nvme_path = local_nvme_path
+        self.is_async = is_async
+        self.endpoint = endpoint
+        self.async_upload_fn = async_upload_fn

    def __str__(self) -> str:
        return f"is_async: {self.is_async}, bucket_name:{self.bucket_name}, endpoint:{self.endpoint}, \
 local_nvme_path: {self.local_nvme_path}"

+    @staticmethod
+    def unpack_boto3_save_meta(meta):
+        """Return the fields of ``meta`` used as positional arguments when saving.
+
+        Args:
+            meta: object exposing ``is_async``, ``client``, ``bucket_name``,
+                ``file_path`` and, for async uploads, ``local_nvme_path``.
+
+        Returns:
+            tuple: ``(client, bucket_name, file_path)``, with ``local_nvme_path``
+            appended as a fourth element when ``meta.is_async`` is truthy.
+        """
+        use_async = meta.is_async
+        common = (meta.client, meta.bucket_name, meta.file_path)
+        if not use_async:
+            return common
+        # Only the async path carries the local temporary file location.
+        return common + (meta.local_nvme_path,)
+
+    @staticmethod
+    def unpack_boto3_nosave_meta(meta):
+        """Return ``(client, bucket_name, file_path)`` taken from ``meta``.
+
+        Used for operations other than saving, so ``local_nvme_path`` is never
+        included, whatever ``meta.is_async`` says.
+        """
+        client = meta.client
+        bucket = meta.bucket_name
+        path = meta.file_path
+        return client, bucket, path
+

 class LocalMetaInfo:
    """Local meta info for save/load etc."""

-    def __init__(self, handler: StorageClient, dest_path: str) -> None:
-        self.is_async = False
-        self.client = handler
-        self.dest_path = dest_path
+    def __init__(self, file_path: str) -> None:
+        self.file_path = file_path
        self.async_upload_fn = None
+        self.is_async = False
+
+    @staticmethod
+    def unpack_local_save_meta(meta):
+        """Return the one-element tuple ``(file_path,)`` used as save arguments."""
+        path = meta.file_path
+        return (path,)
+
+    @staticmethod
+    def unpack_local_nosave_meta(meta):
+        """Return the one-element tuple ``(file_path,)`` used as non-save arguments."""
+        path = meta.file_path
+        return (path,)


-def unpack_meta(meta):
-    args = []
-    is_async = meta.is_async
-    for k, v in meta.__dict__.items():
-        if k in ("endpoint", "async_upload_fn", "is_async"):
-            continue
-        if not is_async and k in ("local_nvme_path",):
-            continue
-        args.append(v)
+def unpack_save_meta(meta: Union[Boto3MetaInfo, LocalMetaInfo]):
+    """Turn a meta-info object into the positional arguments of a save call.
+
+    Args:
+        meta: a ``Boto3MetaInfo`` or ``LocalMetaInfo`` instance.
+
+    Returns:
+        tuple: the result of the matching ``unpack_*_save_meta`` helper.
+
+    Raises:
+        ValueError: if ``meta`` is neither of the two supported types.
+    """
+    # Checked in order; the first matching meta type wins.
+    unpackers = (
+        (Boto3MetaInfo, Boto3MetaInfo.unpack_boto3_save_meta),
+        (LocalMetaInfo, LocalMetaInfo.unpack_local_save_meta),
+    )
+    for meta_cls, unpack in unpackers:
+        if isinstance(meta, meta_cls):
+            return unpack(meta)
+    raise ValueError(f"unkonwn meta info: {type(meta)}")

-    return args
+
+def unpack_nosave_meta(meta: Union[Boto3MetaInfo, LocalMetaInfo]):
+    """Turn a meta-info object into the positional arguments of a non-save call.
+
+    Args:
+        meta: a ``Boto3MetaInfo`` or ``LocalMetaInfo`` instance.
+
+    Returns:
+        tuple: the result of the matching ``unpack_*_nosave_meta`` helper.
+
+    Raises:
+        ValueError: if ``meta`` is neither of the two supported types.
+    """
+    # Checked in order; the first matching meta type wins.
+    unpackers = (
+        (Boto3MetaInfo, Boto3MetaInfo.unpack_boto3_nosave_meta),
+        (LocalMetaInfo, LocalMetaInfo.unpack_local_nosave_meta),
+    )
+    for meta_cls, unpack in unpackers:
+        if isinstance(meta, meta_cls):
+            return unpack(meta)
+    raise ValueError(f"unkonwn meta info: {type(meta)}")


 def compute_file_md5_by_chunk(file_name: str):
@ -136,6 +165,22 @@ def compute_file_md5_by_chunk(file_name: str):
    return hash_md5.hexdigest()


+def try_get_storage_backend(path: str):
+    """Split a storage path into its backend name and the backend-local path.
+
+    A path normally carries an explicit backend prefix such as ``local:/path/to/ckpt``.
+    When the prefix is missing the backend is guessed: a bare ``s3:`` url is
+    treated as boto3, a path without any ``:`` as local. In both guessed cases
+    the path is returned unchanged and a warning is logged on the logging rank.
+
+    Args:
+        path (str): storage path, with or without a backend prefix.
+
+    Returns:
+        Tuple[str, str]: ``(backend, path_without_backend_prefix)``.
+    """
+    # A bare "s3:..." url contains ':' itself, so it must be recognised before
+    # splitting; otherwise "s3" would be mistaken for the backend prefix.
+    if path.startswith("s3:"):
+        if gpc.is_rank_for_log():
+            logger.warning(f"path: '{path}' not start with backend prefix, guess it is the backend of boto3.")
+        return "boto3", path
+
+    backend, sep, stripped_path = path.partition(":")
+    if not sep:
+        # No ':' at all: nothing to strip, so hand back the path itself as a
+        # string (not the one-element list that str.split would produce).
+        if gpc.is_rank_for_log():
+            logger.warning(f"path: '{path}' not start with backend prefix, guess it is the backend of local.")
+        return "local", path
+
+    return backend, stripped_path  # (backend_prefix, splited_path)
+
+
 class Boto3Client(StorageClient):
    """
    Boto3Client
@ -189,13 +234,11 @@ class Boto3Client(StorageClient):
        )

    @staticmethod
-    def sync_upload_fileobj(
-        handler, bucket_name: str, fp: str, local_nvme_path: str, *args, saved_obj=None, **kwargs
-    ):  # pylint: disable=W0613
+    def sync_upload_fileobj(handler, bucket_name: str, fp: str, saved_obj=None, **kwargs):
        assert saved_obj is not None, "saved_obj is None!"
        try:
            with io.BytesIO() as f:
-                torch.save(saved_obj, f, *args, **kwargs)
+                torch.save(saved_obj, f, **kwargs)
                f.seek(0)
                handler.client.upload_fileobj(f, bucket_name, fp, Config=handler.config)
        except handler.botocore.exceptions.EndpointConnectionError as exc:
@ -204,14 +247,7 @@ class Boto3Client(StorageClient):
            ) from exc

    @staticmethod
-    def load(
-        handler,
-        bucket_name: str,
-        fp: str,
-        local_nvme_path: str,  # pylint: disable=W0613
-        *args,
-        **kwargs,
-    ) -> Dict:
+    def load(handler, bucket_name: str, fp: str, **kwargs) -> Dict:
        """
        Args:
            fp (str): Path to save, eg. s3://opennlplab/model_weights/xxx/ddd.pt
@ -220,7 +256,7 @@ class Boto3Client(StorageClient):
            with io.BytesIO() as f:
                handler.client.download_fileobj(bucket_name, fp, f, Config=handler.config)
                f.seek(0)
-                states = torch.load(f, *args, **kwargs)
+                states = torch.load(f, **kwargs)
        except handler.botocore.exceptions.EndpointConnectionError as exc:
            raise RuntimeError(
                f"Boto3 Network Error: Please Check your Internet Connection in {socket.gethostname()}"
@ -228,24 +264,37 @@ class Boto3Client(StorageClient):
        return states

    @staticmethod
-    def assert_fp_exists(handler, bucket_name: str, fp: str, local_nvme_path: str):  # pylint: disable=W0613
+    def assert_fp_exists(handler, bucket_name: str, fp: str):  # pylint: disable=W0613
        assert len(list(handler.client.list_objects(Bucket=bucket_name, Prefix=fp)["Contents"])) > 0, fp

    @staticmethod
-    def get_fns(handler, bucket_name: str, fp: str, local_nvme_path: str, *args, **kwargs):  # pylint: disable=W0613
+    def is_fp_exists(handler, bucket_name: str, fp: str):  # pylint: disable=W0613
+        """Report whether any object exists under prefix ``fp`` in ``bucket_name``.
+
+        Args:
+            handler: object whose ``client`` attribute provides ``list_objects``.
+            bucket_name (str): bucket to query.
+            fp (str): key prefix to look for.
+
+        Returns:
+            bool: True when the listing response has a non-empty ``"Contents"``
+            entry, False otherwise.
+        """
+        response = handler.client.list_objects(Bucket=bucket_name, Prefix=fp)
+        # A response without a "Contents" key is treated as "nothing found".
+        if "Contents" not in response:
+            return False
+        return len(list(response["Contents"])) > 0
+
+    @staticmethod
+    def get_fns(handler, bucket_name: str, fp: str):
        """
        Ref: https://stackoverflow.com/questions/54314563/
        how-to-get-more-than-1000-objects-from-s3-by-using-list-objects-v2
        """
-        paginator = handler.client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)
-        folder_name_list = []
-        for page in pages:
-            if "Contents" in page:
-                for obj in page["Contents"]:
-                    pth: str = obj["Key"]
-                    folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
-        return list(set(folder_name_list))
+        if Boto3Client.is_fp_exists(handler, bucket_name, fp):
+            paginator = handler.client.get_paginator("list_objects_v2")
+            pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)
+            folder_name_list = []
+            for page in pages:
+                if "Contents" in page:
+                    for obj in page["Contents"]:
+                        pth: str = obj["Key"]
+                        folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
+            return list(set(folder_name_list))
+        else:
+            if gpc.is_rank_for_log():
+                logger.warning(f"'{fp}' not found!")
+            return None

    @staticmethod
    def async_upload_fileobj(handler, bucket_name: str, fp: str, local_nvme_path: str):
@ -273,37 +322,35 @@ class LocalClient(StorageClient):
        super().__init__(None)

    @staticmethod
-    def sync_upload_fileobj(handler, fp: str, *args, saved_obj=None, **kwargs):
-        assert isinstance(handler, LocalClient)
+    def sync_upload_fileobj(fp: str, saved_obj=None, **kwargs):
        assert saved_obj is not None
        fp_dirname = os.path.dirname(fp)
        if not os.path.exists(fp_dirname):
            os.makedirs(fp_dirname, exist_ok=True)
-        torch.save(saved_obj, fp, *args, **kwargs)
+        torch.save(saved_obj, fp, **kwargs)

    @staticmethod
-    def load(handler, fp: str, *args, **kwargs):  # pylint: disable=W0613
-        assert isinstance(handler, LocalClient)
-        assert os.path.exists(fp), f"{fp} is not found!"
-        with open(fp, "rb") as f:
-            states = torch.load(f, *args, **kwargs)
+    def load(load_path: str, **kwargs):
+        assert os.path.exists(load_path), f"{load_path} is not found!"
+        with open(load_path, "rb") as f:
+            states = torch.load(f, **kwargs)
        return states

    @staticmethod
-    def assert_fp_exists(handler, folder):
-        assert isinstance(handler, LocalClient)
+    def assert_fp_exists(folder):
        assert os.path.exists(folder), folder

    @staticmethod
-    def get_fns(handler, folder):
-        assert isinstance(handler, LocalClient)
-        assert os.path.exists(folder), f"folder '{folder}' not exists!"
-        fns = os.listdir(folder)
-        return fns
+    def get_fns(folder):
+        """List the entries of a local ``folder``.
+
+        Args:
+            folder: local directory path.
+
+        Returns:
+            The ``os.listdir`` result for ``folder``, or None when the path does
+            not exist (a warning is logged on the logging rank in that case).
+        """
+        if os.path.exists(folder):
+            return os.listdir(folder)
+
+        # Missing folder: warn once from the logging rank and signal "nothing found".
+        if gpc.is_rank_for_log():
+            logger.warning(f"'{folder}' not found!")
+        return None

    @staticmethod
-    def delete_obj(handler, fp: str):
-        assert isinstance(handler, LocalClient)
+    def delete_obj(fp: str):
        if not os.path.isdir(fp):
            os.remove(fp)

@ -327,7 +374,10 @@ def get_boto3_meta(fp: str, tmp_local_folder: str, is_async: bool) -> Boto3MetaI
    assert match is not None, f"url '{fp}' is not a valid boto3 url"
    bucket_name, endpoint = match.group(1), match.group(2)
    endpoint = "http://" + endpoint + ":80"
-    tmp_step_file = get_tmp_file_name(tmp_local_folder, fp)
+    if is_async:
+        tmp_step_file = get_tmp_file_name(tmp_local_folder, fp)
+    else:
+        tmp_step_file = None
    return Boto3MetaInfo(
        is_async=is_async,
        handler=None,
@ -341,7 +391,7 @@ def get_boto3_meta(fp: str, tmp_local_folder: str, is_async: bool) -> Boto3MetaI

 def get_local_meta(fp: str) -> LocalMetaInfo:
    assert not fp.startswith("s3://"), f"Path '{fp}' is not a local path"
-    return LocalMetaInfo(None, fp)
+    return LocalMetaInfo(fp)


 def get_mount_point_free_size(path: str):
@ -427,7 +477,7 @@ class StorageManager(metaclass=SingletonMeta):
                logger.error(f'tmp_local_folder only have "{free_size}" GB free space, less then 100 GB!')
                raise RuntimeError(f"Insufficient temporary storage space on {socket.gethostname()}")

-    def _get_client(self, path=str) -> Union[Boto3MetaInfo, LocalMetaInfo]:
+    def _get_client(self, path: str, async_mode: bool = False) -> Union[Boto3MetaInfo, LocalMetaInfo]:
        """
        example:
        local:/path/to/checkpoint
@ -436,17 +486,14 @@ class StorageManager(metaclass=SingletonMeta):
        Args:
            path (str): _description_
        """
-        try:
-            backend, path = path.split(":", maxsplit=1)
-        except Exception as exc:
-            raise AttributeError(f"Given path '{path}' is not startwith backend prefix:'local/boto3'") from exc
+        backend, path = try_get_storage_backend(path)

        init_args = (None,)
        if backend == "local":
            meta_info = get_local_meta(path)
            backend_key = backend
        elif backend == "boto3":
-            meta_info = get_boto3_meta(path, self.tmp_local_folder, self.async_mode)
+            meta_info = get_boto3_meta(path, self.tmp_local_folder, async_mode)
            backend_key = backend + ":" + meta_info.endpoint
            init_args = (meta_info.endpoint,)
            if (
@ -474,17 +521,22 @@ class StorageManager(metaclass=SingletonMeta):

    def assert_fp_exists(self, folder) -> None:
        meta = self._get_client(path=folder)
-        meta.client.assert_fp_exists(*unpack_meta(meta))
+        meta.client.assert_fp_exists(*unpack_nosave_meta(meta))

    def get_fns(self, folder) -> List[str]:
        meta = self._get_client(path=folder)
-        return meta.client.get_fns(*unpack_meta(meta))
+        return meta.client.get_fns(*unpack_nosave_meta(meta))

-    def save(self, save_path: str, saved_obj: Any, *args, async_upload=None, **kwargs):
-        meta = self._get_client(path=save_path)
+    def save(self, save_path: str, to_save_obj: Any, async_upload=None, **kwargs):

        if async_upload is None:
            async_upload = self.async_mode
+
+        if not save_path.startswith("boto3:"):
+            async_upload = False
+
+        meta = self._get_client(save_path, async_upload)
+
        if async_upload:
            assert (
                self.tmp_local_folder
@ -492,22 +544,22 @@ class StorageManager(metaclass=SingletonMeta):
            tmp_step_file = meta.local_nvme_path
            self._to_be_del_files.append(tmp_step_file)
            with open(tmp_step_file, "wb") as f:
-                torch.save(saved_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
-            self.async_executor(meta.async_upload_fn, *unpack_meta(meta))
+                torch.save(to_save_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
+            self.async_executor(meta.async_upload_fn, *unpack_save_meta(meta))
            os.chmod(tmp_step_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
            self.async_task_peeding = True
        else:
-            meta.client.sync_upload_fileobj(*unpack_meta(meta), *args, saved_obj=saved_obj, **kwargs)
+            meta.client.sync_upload_fileobj(*unpack_save_meta(meta), saved_obj=to_save_obj, **kwargs)
            self.upload_count += 1

-    def load(self, load_path: str, *args, **kwargs) -> Any:
+    def load(self, load_path: str, **kwargs) -> Any:
        self.wait()
        meta = self._get_client(path=load_path)
-        return meta.client.load(*unpack_meta(meta), *args, **kwargs)
+        return meta.client.load(*unpack_nosave_meta(meta), **kwargs)

    def delete_obj(self, fp: str):
        meta = self._get_client(path=fp)
-        meta.client.delete_obj(*unpack_meta(meta))
+        meta.client.delete_obj(*unpack_nosave_meta(meta))

    def _del_tmp_folder(self):
        for fp in self._to_be_del_files:
@ -594,23 +646,24 @@ class StorageManager(metaclass=SingletonMeta):

        if gpc.is_rank_for_log():
            self.upload_count += 1
-            if self.async_mode:
+            if self.async_mode and self.latest_save_folder:
                self.save(
                    os.path.join(self.latest_save_folder, f"{self.latest_save_step}.step"),
-                    saved_obj=dict({"step": self.latest_save_step}),
+                    to_save_obj=dict({"step": self.latest_save_step}),
                    async_upload=False,
                )
+                self.latest_save_folder = None


 storage_manager: StorageManager = None


-def init_storage_manager(ckpt_config):
+def init_storage_manager(enable_save_ckpt, async_upload_tmp_folder, async_upload):
    global storage_manager
    storage_manager = StorageManager(
-        ckpt_config.enable_save_ckpt,
-        tmp_local_folder=ckpt_config.async_upload_tmp_folder,
-        async_mode=ckpt_config.async_upload,
+        enable_save_ckpt,
+        tmp_local_folder=async_upload_tmp_folder,
+        async_mode=async_upload,
    )


--- a/internlm/utils/timeout.py
+++ b/internlm/utils/timeout.py
@ -1,4 +1,13 @@
+import datetime
+import os
 import signal
+import socket
+import traceback
+from functools import wraps
+
+from internlm.utils.logger import get_logger
+
+logger = get_logger(__file__)


 class Timeout:
@ -24,3 +33,81 @@ class Timeout:

    def __exit__(self, error_type, value, traceback):
        signal.alarm(0)
+
+
+# Timeouts are opt-in: they are enabled as soon as INTERNLM_ENABLE_TIMEOUT is set to
+# any value (the check below is `is not None`, so even "0" or "" enables them).
+ENABLE_TIMEOUT = os.getenv("INTERNLM_ENABLE_TIMEOUT", None)
+
+
+# Per-function timeout thresholds, keyed by function name. `llm_timeout` hands the
+# value to `signal.alarm`, so the unit is seconds.
+timeout_threshold_dict = {
+    "initialize_distributed_env": 120,
+    "nopp_forward_backward_step": 360,
+    "initialize_model": 10,
+    "initialize_optimizer": 20,
+    "optim_step": 30,
+    "get_train_data_loader": 600,
+    "get_validation_data_loader": 60,
+    "load_new_batch": 10,
+    "record_current_batch_training_metrics": 10,
+    "save_checkpoint": 1200,
+    "interleaved_forward_backward_step": 600,
+    "nointerleaved_forward_backward_step": 600,
+}
+
+if ENABLE_TIMEOUT is not None:
+    # Timeouts enabled: export NCCL_ASYNC_ERROR_HANDLING=1 and read the NCCL timeout
+    # from the NCCL_TIMEOUT environment variable (default: 60 seconds).
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "1"
+    LLM_NCCL_TIMEOUT = datetime.timedelta(seconds=int(os.getenv("NCCL_TIMEOUT", str(60))))
+else:
+    # Timeouts disabled: every listed threshold becomes 0 (`llm_timeout` only arms the
+    # alarm for values > 0) and the NCCL timeout is fixed at 1800 seconds.
+    timeout_threshold_dict = dict.fromkeys(timeout_threshold_dict.keys(), 0)
+    LLM_NCCL_TIMEOUT = datetime.timedelta(seconds=1800)
+
+
+def try_get_gpc_rank():
+    """Return a ``host-<hostname>-rank-<rank>`` tag for log messages.
+
+    The global rank is looked up lazily from the global parallel context. If the
+    import or the lookup fails, the literal ``"unknown"`` is used as the rank.
+
+    Returns:
+        str: the formatted host/rank tag.
+    """
+    try:
+        from internlm.core.context import global_context as gpc
+
+        rank = gpc.get_global_rank()
+    except Exception:  # pylint: disable=broad-except
+        # Best effort only; KeyboardInterrupt/SystemExit are deliberately not swallowed.
+        rank = "unknown"
+
+    return f"host-{socket.gethostname()}-rank-{rank}"
+
+
+def llm_timeout(seconds=0, func_name=None):
+    """Timeout decorator built on ``SIGALRM``.
+
+    Note that this decorator cannot be reentrant: arming a nested timeout replaces
+    the pending alarm of the enclosing one.
+
+    Args:
+        seconds (int, optional): fallback timeout threshold in seconds, used when
+            ``func_name`` has no entry in ``timeout_threshold_dict``. Values <= 0
+            disable the timeout. Defaults to 0.
+        func_name (str, optional): key looked up in ``timeout_threshold_dict`` and the
+            name reported in the error log. Defaults to the decorated function's
+            ``__name__``.
+
+    Raises:
+        TimeoutError: re-raised from the wrapped call after being logged.
+    """
+
+    def decorator(func):
+        # Resolve the name per decorated function instead of mutating the closure, so
+        # one decorator object applied to several functions keeps their names apart.
+        name = func.__name__ if func_name is None else func_name
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            def _handle_timeout(signum, frame):
+                raise TimeoutError
+
+            threshold = timeout_threshold_dict.get(name, seconds)
+            armed = threshold > 0
+            previous_handler = None
+
+            if armed:
+                previous_handler = signal.signal(signal.SIGALRM, _handle_timeout)
+                signal.alarm(threshold)
+
+            try:
+                return func(*args, **kwargs)
+            except TimeoutError:
+                logger.error(f"TimeoutError at {try_get_gpc_rank()}: {name}\n {traceback.format_exc()}")
+                raise
+            finally:
+                # Only undo what this call set up: an un-armed call must not cancel an
+                # alarm that belongs to an enclosing scope.
+                if armed:
+                    signal.alarm(0)
+                    # signal.signal() returns None for handlers not installed from Python.
+                    if previous_handler is not None:
+                        signal.signal(signal.SIGALRM, previous_handler)
+
+        return wrapper
+
+    return decorator
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@ -13,4 +13,4 @@ boto3
 botocore
 torch-scatter
 pyecharts
-f https://data.pyg.org/whl/torch-1.13.0+cu117.html
+-f https://data.pyg.org/whl/torch-1.13.1+cu117.html
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/test_utils/common_fixture.py
+++ b/tests/test_utils/common_fixture.py
@ -0,0 +1,183 @@
+import os
+import shutil
+from subprocess import PIPE, STDOUT, Popen
+
+import pytest
+import torch
+
+from internlm.core.context import global_context as gpc
+from internlm.core.context.parallel_context import Config
+from internlm.solver.optimizer.hybrid_zero_optim import HybridZeroOptimizer
+from internlm.utils.common import SingletonMeta
+
+# Required environment variables: importing this module raises KeyError if any of
+# OSS_BUCKET_NAME, OSS_IP or USER is unset.
+OSS_NAME = os.environ["OSS_BUCKET_NAME"]
+OSS_IP = os.environ["OSS_IP"]
+USER = os.environ["USER"]
+JOB_NAME = "CI_TEST"
+# Save locations carry a backend prefix ("local:" / "boto3:") in front of the path.
+LOCAL_SAVE_PATH = "local:local_ckpt"
+
+BOTO_SAVE_PATH = f"boto3:s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
+# Same bucket location without the "boto3:" prefix and with a trailing slash; used by
+# del_tmp_file when building the cleanup command.
+BOTO_SAVE_PATH_NO_PRFIX = f"s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+
+# Folder passed to the tests as `async_upload_tmp_folder`; removed by del_tmp_file.
+ASYNC_TMP_FOLDER = "./async_tmp_folder"
+
+
+# Shared test configuration (labelled "1B" by the original author): single-process
+# parallel layout (zero1=1, pipeline size 1, tensor=1) and a 2-layer model with
+# hidden_size=1024 in bfloat16.
+init_config = Config(
+    dict(
+        parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1),
+        model_type="INTERNLM",
+        adam=dict(
+            lr=1e-4,
+        ),
+        data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999),
+        # These keyword arguments are forwarded verbatim to the model builder in init_naive_model.
+        model=dict(
+            checkpoint=False,
+            num_attention_heads=2,
+            embed_split_hidden=True,
+            vocab_size=103168,
+            embed_grad_scale=1,
+            parallel_output=True,
+            hidden_size=1024,
+            num_layers=2,
+            mlp_ratio=1,
+            apply_post_layer_norm=False,
+            dtype=torch.bfloat16,
+            norm_type="rmsnorm",
+            layer_norm_epsilon=1e-5,
+            use_flash_attn=True,
+            num_chunks=1,
+        ),
+        resume_tb_folder="",
+        tensorboard_folder="",
+        alert_address=None,
+        # Alerting is switched off for the tests.
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+    )
+)
+
+
+def init_naive_model():
+    """Build the test model from ``init_config.model`` and wrap it in ``NaiveAMPModel`` (bfloat16)."""
+    # Importing the modeling module first lets MODEL_INITIALIZER see the model builder.
+    import internlm.model.modeling_internlm  # noqa # pylint: disable=unused-import
+    from internlm.core.naive_amp import NaiveAMPModel
+    from internlm.utils.registry import MODEL_INITIALIZER
+
+    builder = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)
+    raw_model = builder(**(init_config.model))
+    return NaiveAMPModel(
+        model=raw_model,
+        output_to_fp32=False,
+        dtype=torch.bfloat16,
+        sync_buffer=False,
+    )
+
+
+def init_naive_optim(model):
+    """Create a plain ``torch.optim.AdamW`` over all parameters of ``model``."""
+    param_groups = [{"params": model.parameters(), "weight_decay": 0.01}]
+    return torch.optim.AdamW(params=param_groups, lr=1e-4, betas=(0.9, 0.95), eps=1e-8)
+
+
+def init_hybrid_optim(model):
+    """Wrap an AdamW optimizer for ``model`` in a ``HybridZeroOptimizer``."""
+    # Same AdamW settings as the plain optimizer helper.
+    base_optimizer = init_naive_optim(model)
+
+    grad_scaler_config = Config(
+        dict(
+            fp16=dict(
+                initial_scale=2**16,
+                min_scale=1,
+                growth_interval=1000,
+            ),
+            growth_factor=2,
+            backoff_factor=0.5,
+            max_scale=2**24,
+            hysteresis=2,
+        )
+    )
+    zero_config = Config(
+        dict(
+            overlap_sync_grad=False,
+            overlap_sync_param=False,
+            reduce_bucket_size=512 * 1024 * 1024,
+            clip_grad_norm=1.0,
+        )
+    )
+
+    return HybridZeroOptimizer(
+        base_optimizer,
+        grad_scal_cfg=grad_scaler_config,
+        zero_cfg=zero_config,
+        param_bcast_sync_handler=None,
+    )
+
+
+@pytest.fixture(autouse=True, scope="function")
+def reset_singletons():
+    """Rebind ``SingletonMeta._instances`` to an empty dict before each test function."""
+    SingletonMeta._instances = dict()
+
+
+def reset_seed():
+    """Call ``reset()`` on the seed manager of ``internlm.core.context.random``."""
+    from internlm.core.context.random import _SEED_MANAGER as seed_manager
+
+    seed_manager.reset()
+
+
+@pytest.fixture(scope="module")
+def init_dist_and_model(rank=0, world_size=1):
+    """Module-scoped fixture yielding ``(model, optimizer)`` inside an initialized distributed env.
+
+    Setup exports the torch-launcher environment variables, initializes the distributed
+    environment with ``init_config`` and builds the model plus hybrid-zero optimizer.
+    Teardown destroys the global context and resets the seed manager.
+    """
+    from internlm.initialize import initialize_distributed_env
+
+    os.environ.update(
+        {
+            "RANK": str(rank),
+            "LOCAL_RANK": str(rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "127.0.0.1",
+            "MASTER_PORT": "12377",
+        }
+    )
+    initialize_distributed_env(config=init_config, launcher="torch", master_port=12377, args_check=False)
+
+    # setup
+    print("set up", flush=True)
+    model = init_naive_model()
+    # init_naive_optim(model) can be swapped in here to exercise a plain AdamW instead.
+    opim = init_hybrid_optim(model)
+
+    yield model, opim
+
+    # teardown
+    del model, opim
+    print("teardown", flush=True)
+    gpc.destroy()
+    reset_seed()
+
+
+# Generator that prints "<text> begin!", yields once, then prints "<text> end!".
+# NOTE(review): it is not decorated as a pytest fixture or context manager in this
+# file — confirm how callers are expected to drive it.
+def enter_flag(text):
+    print(f"{text} begin!", flush=True)
+    yield
+    print(f"{text} end!", flush=True)
+
+
+def del_tmp_file():
+    """Best-effort cleanup of everything the checkpoint/storage tests may leave behind.
+
+    Removes the async upload temp folder and the local checkpoint folder, then runs the
+    ``sensesync`` command against the boto3 test prefix and prints its output. Failures
+    of the external command are reported but never raised.
+    """
+    # ignore_errors=True already covers a missing folder, so no try/except is needed.
+    shutil.rmtree(ASYNC_TMP_FOLDER, ignore_errors=True)
+    shutil.rmtree(LOCAL_SAVE_PATH.split(":")[1], ignore_errors=True)
+
+    try:
+        cmd = r"/mnt/petrelfs/share/sensesync --dryrun --deleteSrc cp " + BOTO_SAVE_PATH_NO_PRFIX + " / "
+        with Popen(cmd, stdout=PIPE, stderr=STDOUT, shell=True) as output:
+            presults = "".join(line.rstrip().decode() + "\n" for line in iter(output.stdout.readline, b""))
+        print(presults, flush=True)
+    except Exception as e:  # pylint: disable=broad-except
+        # Cleanup must never fail a test, but leave a trace of what went wrong.
+        print(f"del_tmp_file: remote cleanup failed: {e}", flush=True)
--- a/tests/test_utils/test_model_checkpoint.py
+++ b/tests/test_utils/test_model_checkpoint.py
@ -0,0 +1,358 @@
+import os
+from functools import partial
+
+import pytest
+import torch
+import torch.distributed as dist
+
+from internlm.core.context.parallel_context import Config
+from internlm.core.trainer import TrainState
+from internlm.solver.optimizer.hybrid_zero_optim import HybridZeroOptimizer
+from internlm.utils.common import SingletonMeta
+from internlm.utils.model_checkpoint import CheckpointManager
+from internlm.utils.storage_manager import wait_async_upload_finish
+from tests.test_utils.common_fixture import (  # noqa # pylint: disable=unused-import
+    ASYNC_TMP_FOLDER,
+    BOTO_SAVE_PATH,
+    LOCAL_SAVE_PATH,
+    del_tmp_file,
+    init_config,
+    init_dist_and_model,
+    reset_singletons,
+)
+
+# (TOTAL_STEP, CKPT_EVERY, SNAPSHOT_EVERY)
+step_info_list = [(8, 4, 2), (3, 4, 2), (1, 6, 3)]
+# Checkpoint configs the test is parametrized over. `checkpoint_every` and
+# `oss_snapshot_freq` are placeholders here: test_ckpt_mm overwrites them from step_info_list.
+ckpt_config_list = [
+    # Old interface format (boto3 target, async upload)
+    dict(
+        enable_save_ckpt=True,
+        save_ckpt_folder=BOTO_SAVE_PATH,
+        load_optimizer=True,
+        checkpoint_every=0,
+        async_upload=True,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        snapshot_ckpt_folder="/".join([BOTO_SAVE_PATH, "snapshot"]),
+        oss_snapshot_freq=0,
+        stop_file_path=None,
+        load_model_only_folder=None,
+        load_given_ckpt=False,
+        load_ckpt_folder=None,
+        is_old_api=True,
+    ),
+    # Old interface format (local target, sync upload)
+    dict(
+        enable_save_ckpt=True,
+        save_ckpt_folder=LOCAL_SAVE_PATH,
+        load_optimizer=True,
+        checkpoint_every=0,
+        async_upload=False,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        snapshot_ckpt_folder="/".join([LOCAL_SAVE_PATH, "snapshot"]),
+        oss_snapshot_freq=0,
+        stop_file_path=None,
+        load_model_only_folder=None,
+        load_given_ckpt=False,
+        load_ckpt_folder=None,
+        is_old_api=True,
+    ),
+    # New interface format (boto3 target, async upload)
+    dict(
+        enable_save_ckpt=True,
+        save_ckpt_folder=BOTO_SAVE_PATH,
+        checkpoint_every=0,
+        async_upload=True,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        oss_snapshot_freq=0,
+        stop_file_path=None,
+        is_old_api=False,
+        auto_resume=True,
+    ),
+    # New interface format (local target, sync upload)
+    dict(
+        enable_save_ckpt=True,
+        save_ckpt_folder=LOCAL_SAVE_PATH,
+        checkpoint_every=0,
+        async_upload=False,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        oss_snapshot_freq=0,
+        stop_file_path=None,
+        load_ckpt_folder=None,
+        is_old_api=False,
+        auto_resume=True,
+    ),
+]
+
+
+def overwrite_optim_state(optim, set_value):
+    """Fill the parameters tracked by ``optim`` in place with ``set_value``.
+
+    For a ``HybridZeroOptimizer`` both the flat fp32 groups of the current rank and the
+    flat fp16 parameters are overwritten, skipping groups in which this rank is listed
+    in ``param_group_no_params_ranks``. Any other optimizer is handled through its
+    ``param_groups``.
+    """
+    if not isinstance(optim, HybridZeroOptimizer):
+        for param_group in optim.param_groups:
+            for param in param_group["params"]:
+                param.data.fill_(set_value)
+        return
+
+    local_rank = optim._zero_local_rank
+    ranks_without_params = optim.param_group_no_params_ranks
+
+    for gid, fp32_flat in optim._fp32_flat_param_groups_of_current_rank.items():
+        if local_rank not in ranks_without_params[gid]:
+            fp32_flat.data.fill_(set_value)
+
+    for gid in range(len(optim._fp16_param_groups)):
+        if local_rank in ranks_without_params[gid]:
+            continue
+        fp16_flat = optim._param_store.get_flat_fp16_param_by_rank_group(rank=local_rank, group_id=gid)
+        fp16_flat.fill_(set_value)
+
+
+def compare_optim_state(optim1, optim2):
+    """Return True when ``optim1`` and ``optim2`` hold identical parameter values.
+
+    For a ``HybridZeroOptimizer`` the flat fp32 groups of the current rank are compared
+    pairwise (group ids must match as well); groups in which this rank is listed in
+    ``param_group_no_params_ranks`` are only compared by id. Any other optimizer is
+    compared parameter by parameter through ``param_groups``.
+
+    Args:
+        optim1: reference optimizer; its type selects the comparison branch.
+        optim2: optimizer to compare against ``optim1``.
+
+    Returns:
+        bool: True if every compared pair is equal.
+    """
+    re = True
+    if isinstance(optim1, HybridZeroOptimizer):
+        fp32_buff1 = optim1._fp32_flat_param_groups_of_current_rank
+        fp32_buff2 = optim2._fp32_flat_param_groups_of_current_rank
+        for group_id_1, group_id_2 in zip(fp32_buff1, fp32_buff2):
+            re &= group_id_1 == group_id_2
+            if optim1.zero_local_rank not in optim1.param_group_no_params_ranks[group_id_1]:
+                # The second operand must come from optim2; comparing fp32_buff1 with
+                # itself would always report equality.
+                re &= torch.equal(fp32_buff1[group_id_1], fp32_buff2[group_id_2])
+    else:
+        for group1, group2 in zip(optim1.param_groups, optim2.param_groups):
+            for p1, p2 in zip(group1["params"], group2["params"]):
+                re &= torch.equal(p1, p2)
+    return re
+
+
+def compare_optim_value(optim, value):
+    """Return True when every parameter tracked by ``optim`` equals ``value`` element-wise.
+
+    Mirrors the traversal of ``overwrite_optim_state``: flat fp32 and fp16 buffers for a
+    ``HybridZeroOptimizer`` (skipping groups this rank holds no params for), plain
+    ``param_groups`` otherwise.
+    """
+
+    def _is_filled(tensor):
+        return torch.equal(tensor, torch.full_like(tensor, value, dtype=tensor.dtype))
+
+    # Collect every check first so that all tensors are inspected, as in a running `&=`.
+    checks = []
+    if isinstance(optim, HybridZeroOptimizer):
+        for gid, fp32_flat in optim._fp32_flat_param_groups_of_current_rank.items():
+            if optim._zero_local_rank not in optim.param_group_no_params_ranks[gid]:
+                checks.append(_is_filled(fp32_flat))
+        for gid in range(len(optim._fp16_param_groups)):
+            if optim._zero_local_rank not in optim.param_group_no_params_ranks[gid]:
+                fp16_flat = optim._param_store.get_flat_fp16_param_by_rank_group(
+                    rank=optim._zero_local_rank, group_id=gid
+                )
+                checks.append(_is_filled(fp16_flat))
+    else:
+        for param_group in optim.param_groups:
+            for param in param_group["params"]:
+                checks.append(_is_filled(param))
+    return all(checks)
+
+
+def overwrite_model_value(model, value):
+    """Overwrite every parameter of ``model`` in place with the scalar ``value``."""
+    for param in model.parameters():
+        param.data.fill_(value)
+
+
+def compare_model_value(model, value):
+    """Return True when every parameter of ``model`` equals ``value`` element-wise."""
+    # A list (not a generator) keeps the original behaviour of inspecting every parameter.
+    matches = [torch.equal(param, torch.full_like(param, value, dtype=param.dtype)) for param in model.parameters()]
+    return all(matches)
+
+
+# Function-scoped fixture: runs del_tmp_file() both before and after each test that uses it.
+@pytest.fixture(scope="function")
+def del_tmp():
+    del_tmp_file()
+    yield
+    del_tmp_file()
+
+
+def return_prefix_path(save_ckpt_folder):
+    """Map a save folder to ``LOCAL_SAVE_PATH`` if it has the ``local:`` prefix, else ``BOTO_SAVE_PATH``."""
+    is_local = save_ckpt_folder.startswith("local:")
+    return LOCAL_SAVE_PATH if is_local else BOTO_SAVE_PATH
+
+
+def return_latest_save_path(save_ckpt_folder, total_step, snapshot_freq, ckpt_freq):
+    """Compute the step and path of the newest checkpoint the test loop should have written.
+
+    Steps are counted from 1. A step that is a multiple of ``ckpt_freq`` writes a normal
+    checkpoint; otherwise a multiple of ``snapshot_freq`` writes a snapshot. Snapshots
+    alternate between the ``snapshot/1`` and ``snapshot/0`` sub-folders (odd count -> 1,
+    even count -> 0).
+
+    Args:
+        save_ckpt_folder (str): configured save folder; only its backend prefix is used.
+        total_step (int): number of simulated training steps.
+        snapshot_freq (int): snapshot interval in steps.
+        ckpt_freq (int): normal checkpoint interval in steps.
+
+    Returns:
+        tuple: ``(latest_step, latest_path)``, or ``(None, None)`` when nothing was saved.
+    """
+    snapshot_latest_step, normal_latest_step = 0, 0
+    snapshot_latest_count = 0
+
+    for step in range(1, total_step + 1):
+        if step % ckpt_freq == 0:
+            normal_latest_step = step
+        elif step % snapshot_freq == 0:
+            snapshot_latest_step = step
+            snapshot_latest_count += 1
+
+    # Nothing was saved only if neither kind of checkpoint was written; a run with
+    # normal checkpoints but no snapshot still has a latest checkpoint.
+    if snapshot_latest_step == 0 and normal_latest_step == 0:
+        return None, None
+
+    prefix = return_prefix_path(save_ckpt_folder)
+    if normal_latest_step >= snapshot_latest_step:
+        return normal_latest_step, os.path.join(prefix, f"{normal_latest_step}")
+
+    slot = 0 if snapshot_latest_count % 2 == 0 else 1
+    return snapshot_latest_step, f"{prefix}/snapshot/{slot}"
+
+
+# End-to-end CheckpointManager test, parametrized over every (step_info, ckpt_config)
+# combination: simulate `total_step` steps, check which checkpoint is reported as the
+# latest, then resume from it and (when a normal checkpoint exists) from an explicit path.
+@pytest.mark.usefixtures("del_tmp")
+@pytest.mark.usefixtures("reset_singletons")
+@pytest.mark.parametrize("step_info", step_info_list)
+@pytest.mark.parametrize("ckpt_config", ckpt_config_list)
+def test_ckpt_mm(step_info, ckpt_config, init_dist_and_model):  # noqa # pylint: disable=unused-import
+    from internlm.core.context import global_context as gpc
+    from internlm.utils.model_checkpoint import CheckpointLoadMask, CheckpointLoadType
+
+    # Override the placeholder frequencies of the config with the ones under test.
+    ckpt_config = Config(ckpt_config)
+    total_step, checkpoint_every, oss_snapshot_freq = step_info
+    print(total_step, checkpoint_every, oss_snapshot_freq, flush=True)
+    ckpt_config.checkpoint_every = checkpoint_every
+    ckpt_config.oss_snapshot_freq = oss_snapshot_freq
+
+    # Pre-bind the arguments of the expected-result helper.
+    bond_return_latest_save_path = partial(
+        return_latest_save_path,
+        ckpt_config.save_ckpt_folder,
+        total_step,
+        ckpt_config.oss_snapshot_freq,
+        ckpt_config.checkpoint_every,
+    )
+
+    model, opim = init_dist_and_model
+    train_state = TrainState(gpc.config, None)
+    if isinstance(opim, HybridZeroOptimizer):
+        print("Is HybridZeroOptimizer!", flush=True)
+    else:
+        print("Is naive Adam!", flush=True)
+
+    ckpt_mm = CheckpointManager(ckpt_config, model=model, optimizer=opim)
+    latest_ckpt_step = None
+    for i in range(total_step):
+        # Stamp model and optimizer with the loop index so a restored checkpoint
+        # can be identified by its values.
+        overwrite_model_value(model, i)
+        overwrite_optim_state(opim, i)
+
+        train_state.batch_count = i
+        train_state.step_count += 1
+
+        # Remember the last loop index at which the manager reports a save.
+        save_ckpts, _, _ = ckpt_mm.is_now_to_save_ckpt(train_state)
+        if save_ckpts:
+            latest_ckpt_step = i
+
+        ckpt_mm.try_save_checkpoint(train_state)
+
+    wait_async_upload_finish()
+    latest_ckpt_info = ckpt_mm.query_lastest_ckpt()
+    step, path = bond_return_latest_save_path()
+    assert latest_ckpt_info["path"] == path
+    # The helper counts steps from 1 while latest_ckpt_step is a 0-based loop index.
+    if latest_ckpt_step is None:
+        assert latest_ckpt_step == step
+    else:
+        assert latest_ckpt_step == step - 1
+
+    # resume from the previously saved ckpt, using a freshly constructed manager
+    del ckpt_mm
+    SingletonMeta._instances = {}
+    ckpt_mm = CheckpointManager(ckpt_config, model=model, optimizer=opim)
+    ckpt_mm.try_resume_training(train_state)
+
+    if ckpt_config.checkpoint_every < total_step:
+        # we use step_count to decide when to save ckpt, so here latest_ckpt_step = step_count - 1
+        assert train_state.step_count == latest_ckpt_step + 1
+        assert train_state.batch_count == latest_ckpt_step + 1
+        assert compare_optim_value(ckpt_mm.optimizer, latest_ckpt_step), ckpt_mm.optimizer.param_groups[0]["params"][0]
+        assert compare_model_value(ckpt_mm.model, latest_ckpt_step), list(ckpt_mm.model.parameters())[0][0]
+
+        # Point the manager at the first normal checkpoint (step == checkpoint_every)
+        # explicitly and resume again.
+        if ckpt_mm.save_ckpt_folder.startswith("local:"):
+            ckpt_mm.load_ckpt_info = dict(
+                path=os.path.join(LOCAL_SAVE_PATH, f"{ckpt_config.checkpoint_every}"),
+                content=CheckpointLoadMask(("all",)),
+                ckpt_type=CheckpointLoadType.INTERNLM,
+            )
+        else:
+            ckpt_mm.load_ckpt_info = dict(
+                path=os.path.join(BOTO_SAVE_PATH, f"{ckpt_config.checkpoint_every}"),
+                content=CheckpointLoadMask(("all",)),
+                ckpt_type=CheckpointLoadType.INTERNLM,
+            )
+
+        ckpt_mm.try_resume_training(train_state)
+
+        assert train_state.step_count == ckpt_config.checkpoint_every
+        assert train_state.batch_count == ckpt_config.checkpoint_every
+        # the restored values must equal the loop index i at which that ckpt was saved.
+        assert compare_optim_value(ckpt_mm.optimizer, ckpt_config.checkpoint_every - 1), ckpt_mm.optimizer.param_groups[
+            0
+        ]["params"][0]
+        assert compare_model_value(ckpt_mm.model, ckpt_config.checkpoint_every - 1), list(ckpt_mm.model.parameters())[
+            0
+        ][0]
+    else:
+        # No normal checkpoint was written in this configuration; nothing more to verify.
+        pass
+
+
+# File used as `stop_file_path` by the quit-signal test; rank 0 writes the stop step into it.
+STOP_FILE_PATH = "./alter.log"
+
+
+# Per-rank worker of test_quit_siganl_handler: initializes a `world_size`-process
+# distributed env, has rank 0 write "5" into the stop file, then polls
+# CheckpointManager.quit_signal_handler for steps 0..9 and checks the result at step 5.
+def query_quit_file(rank, world_size=2):
+    from internlm.core.context import global_context as gpc
+    from internlm.initialize import initialize_distributed_env
+    from internlm.utils.model_checkpoint import CheckpointSaveType
+
+    ckpt_config = Config(
+        dict(
+            enable_save_ckpt=True,
+            save_ckpt_folder=BOTO_SAVE_PATH,
+            load_optimizer=True,
+            checkpoint_every=0,
+            async_upload=True,
+            async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+            snapshot_ckpt_folder="/".join([BOTO_SAVE_PATH, "snapshot"]),
+            oss_snapshot_freq=0,
+            stop_file_path=STOP_FILE_PATH,
+            load_model_only_folder=None,
+            load_given_ckpt=False,
+            load_ckpt_folder=None,
+            is_old_api=True,
+        ),
+    )
+
+    # Environment expected by the "torch" launcher.
+    os.environ["RANK"] = str(rank)
+    os.environ["LOCAL_RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12376"
+
+    initialize_distributed_env(config=init_config, launcher="torch", master_port=12376, args_check=False)
+    train_state = TrainState(init_config, None)
+    ckpt_mm = CheckpointManager(ckpt_config, model=None, optimizer=None)
+    if rank == 0:
+        with open(STOP_FILE_PATH, "w+") as f:
+            f.write("5")
+    # Make sure the stop file is written before any rank starts polling.
+    dist.barrier()
+    for i in range(10):
+        train_state.step_count = i
+        now_break, now_save_ckpt, save_type = ckpt_mm.quit_signal_handler(train_state)
+        print(
+            f"step:{i}, rank:{rank}, now_break:{now_break}, now_save_ckpt:{now_save_ckpt}, save_type:{save_type}",
+            flush=True,
+        )
+        # At the step written into the stop file every rank must be told to save a
+        # normal checkpoint and break.
+        if train_state.step_count == 5:
+            assert now_break is True
+            assert now_save_ckpt is True
+            assert save_type is CheckpointSaveType.NORMAL_CHECKPOINT
+    dist.barrier()
+    gpc.destroy()
+
+
+def test_quit_siganl_handler():  # noqa # pylint: disable=unused-import
+    """Run ``query_quit_file`` on two spawned worker processes, then delete the stop file."""
+    import multiprocessing
+    from multiprocessing.pool import Pool
+
+    world_size = 2
+    spawn_ctx = multiprocessing.get_context("spawn")
+    with Pool(processes=world_size, context=spawn_ctx) as pool:
+        rank_args = [(rank,) for rank in range(world_size)]
+        worker_results = pool.starmap(query_quit_file, rank_args)
+        for result in worker_results:
+            print(f"Got result: {result}", flush=True)
+
+    os.remove(STOP_FILE_PATH)
+
+
+if __name__ == "__main__":
+    pytest.main()
--- a/tests/test_utils/test_storage_manager.py
+++ b/tests/test_utils/test_storage_manager.py
@ -0,0 +1,89 @@
+import os
+
+import pytest
+import torch
+
+from internlm.core.context.parallel_context import Config
+from internlm.initialize.launch import get_config_value
+from tests.test_utils.common_fixture import (  # noqa # pylint: disable=unused-import
+    ASYNC_TMP_FOLDER,
+    BOTO_SAVE_PATH,
+    LOCAL_SAVE_PATH,
+    del_tmp_file,
+    init_dist_and_model,
+    reset_singletons,
+)
+
+# Re-binds the name imported from common_fixture above to the same literal value.
+ASYNC_TMP_FOLDER = "./async_tmp_folder"
+# Storage-manager configurations the save/load test is parametrized over; `test_id`
+# lets the test tell the cases apart.
+ckpt_config_list = [
+    # async boto
+    dict(
+        enable_save_ckpt=True,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        async_upload=True,
+        save_folder=BOTO_SAVE_PATH,
+        test_id=0,
+    ),
+    # sync local
+    dict(
+        enable_save_ckpt=True,
+        async_upload_tmp_folder=None,
+        async_upload=False,
+        save_folder=LOCAL_SAVE_PATH,
+        test_id=1,
+    ),
+    # sync boto
+    dict(
+        enable_save_ckpt=True,
+        async_upload_tmp_folder=None,
+        async_upload=False,
+        save_folder=BOTO_SAVE_PATH,
+        test_id=2,
+    ),
+    # async local
+    dict(
+        enable_save_ckpt=True,
+        async_upload_tmp_folder=ASYNC_TMP_FOLDER,
+        async_upload=True,
+        save_folder=LOCAL_SAVE_PATH,
+        test_id=3,
+    ),
+]
+
+
+# Function-scoped fixture: runs del_tmp_file() both before and after each test that uses it.
+@pytest.fixture(scope="function")
+def del_tmp():
+    del_tmp_file()
+    yield
+    del_tmp_file()
+
+
+@pytest.mark.usefixtures("del_tmp")
+@pytest.mark.usefixtures("reset_singletons")
+@pytest.mark.parametrize("ckpt_config", ckpt_config_list)
+def test_storage_mm_save_load(ckpt_config, init_dist_and_model):  # noqa # pylint: disable=unused-argument
+    """Save a random tensor through the storage manager and check it loads back unchanged."""
+    from internlm.utils.storage_manager import (
+        check_folder,
+        get_fns,
+        init_storage_manager,
+        llm_load,
+        llm_save,
+        wait_async_upload_finish,
+    )
+
+    cfg = Config(ckpt_config)
+    init_storage_manager(
+        get_config_value(cfg, "enable_save_ckpt", False),
+        get_config_value(cfg, "async_upload_tmp_folder", False),
+        get_config_value(cfg, "async_upload", False),
+    )
+
+    expected = torch.rand(64, 64)
+    target_fp = os.path.join(cfg.save_folder, "test.pt")
+    llm_save(target_fp, expected)
+    # Only the "async boto" case (test_id 0) waits for the upload to finish.
+    if cfg.test_id == 0:
+        wait_async_upload_finish()
+    check_folder(target_fp)
+    assert get_fns(cfg.save_folder)[0] == "test.pt"
+    loaded = llm_load(target_fp, map_location="cpu")
+    assert 0 == ((loaded != expected).sum())
--- a/tests/test_utils/test_timeout.py
+++ b/tests/test_utils/test_timeout.py
@ -0,0 +1,119 @@
+import fcntl
+import os
+import time
+from multiprocessing import Process
+
+import pytest
+import torch
+import torch.distributed as dist
+
+os.environ["INTERNLM_ENABLE_TIMEOUT"] = "1"  # noqa  # pylint: disable=wrong-import-position
+os.environ["NCCL_TIMEOUT"] = "5"
+from internlm.utils.timeout import llm_timeout
+from tests.test_utils.common_fixture import (  # noqa # pylint: disable=unused-import
+    init_config,
+)
+
+WORLD_SIZE = 2
+
+
+# Sleeps for 10 s under a 2 s timeout. "fake_timeout_func" is not among the keys of the
+# timeout_threshold_dict literal in internlm/utils/timeout.py, so the decorator's own
+# `seconds` value applies.
+@llm_timeout(2, "fake_timeout_func")
+def fake_timeout_func():
+    time.sleep(10)
+
+
+# Provokes a hang in a collective: after one all_reduce that both ranks join, only
+# rank 0 issues a second all_reduce while every other rank just sleeps.
+@llm_timeout(10, "nccl_timeout_func")
+def nccl_timeout_func(rank):
+    # see: https://github.com/pytorch/pytorch/issues/104506#issuecomment-1679762880
+    # 'NCCL_ASYNC_ERROR_HANDLING' cannot take effect on the first collective communication.
+    buff = torch.ones([64, 64]).cuda(rank)
+    dist.all_reduce(buff)  # lazy communicator init
+    torch.cuda.synchronize()
+    if rank == 0:
+        # No peer joins this second all_reduce.
+        dist.all_reduce(buff)
+        torch.cuda.synchronize()  # main thread will hang at here.
+    else:
+        time.sleep(9999)
+
+
+# Provokes a hang on an exclusive file lock: rank 1 starts 5 s late, so rank 0 takes
+# the lock first and then sleeps while holding it.
+@llm_timeout(10, "try_file_lock")
+def try_file_lock(rank, stop_file_path):
+    if rank == 1:
+        time.sleep(5)
+
+    with open(stop_file_path, "r", encoding="utf-8") as f:
+        fcntl.flock(f, fcntl.LOCK_EX)  # rank 1 hang.
+        if rank == 0:
+            time.sleep(99999)  # rank 0 hang.
+        f.seek(0)
+        f.read()
+        fcntl.flock(f, fcntl.LOCK_UN)
+
+
+def local_timeout(rank, _):
+    """Process target: ``fake_timeout_func`` must raise ``TimeoutError``; the second argument is unused."""
+    try:
+        fake_timeout_func()
+    except TimeoutError as e:
+        print(f"local_timeout, rank:{rank}, e:{e}", flush=True)
+        return
+
+    assert False, "It should timeout!"
+
+
+# Process target: initializes the distributed environment for this rank, then expects
+# nccl_timeout_func to end in a TimeoutError.
+def gpc_timeout(rank, world_size):
+
+    from internlm.initialize import initialize_distributed_env
+
+    # Environment expected by the "torch" launcher.
+    os.environ["RANK"] = str(rank)
+    os.environ["LOCAL_RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12377"
+    initialize_distributed_env(config=init_config, launcher="torch", master_port=12377, args_check=False)
+
+    try:
+        nccl_timeout_func(rank)
+    except TimeoutError as e:
+        print(f"gpc_timeout, rank:{rank}, e:{e}", flush=True)
+        time.sleep(5)  # wait rank 0 to be killed
+    else:
+        time.sleep(5)  # give some time to let Watchdog kill rank 0.
+        assert False, "It should timeout!"
+
+
+# Process target: rank 0 creates the (empty) lock file, every rank then expects
+# try_file_lock to end in a TimeoutError, and rank 0 removes the file afterwards.
+# The second positional argument (world size) is unused.
+def file_lock_timeout(rank, _, stop_file_path):
+    if rank == 0:
+        with open(stop_file_path, "w"):
+            pass
+    try:
+        try_file_lock(rank, stop_file_path)
+    except TimeoutError as e:
+        print(e, flush=True)
+    else:
+        assert False, "It should timeout!"
+    finally:
+        # Cleanup runs whether or not the timeout fired.
+        if rank == 0:
+            os.remove(stop_file_path)
+
+
+# (process target, world size, optional extra positional argument)
+timeout_func_list = [(gpc_timeout, 2, None), (local_timeout, 1, None), (file_lock_timeout, 2, "test_lock.log")]
+
+
+@pytest.mark.parametrize("timeout_func_and_args", timeout_func_list)
+def test_timeout(timeout_func_and_args):
+    """Start one process per rank for the given scenario and reap each within 15 seconds.
+
+    Processes still alive after the join timeout are terminated. The children's exit
+    codes are not inspected here.
+    """
+    timeout_func, world_size, other_args = timeout_func_and_args
+    extra_args = () if other_args is None else (other_args,)
+
+    procs = []
+    for rank in range(world_size):
+        worker = Process(target=timeout_func, args=(rank, world_size) + extra_args)
+        worker.start()
+        procs.append(worker)
+
+    for worker in procs:
+        worker.join(15)
+        if worker.is_alive():
+            worker.terminate()
+            worker.join()
--- a/train.py
+++ b/train.py
@ -35,6 +35,7 @@ from internlm.utils.common import (
    parse_args,
 )
 from internlm.utils.evaluation import evaluate_on_val_dls
+from internlm.utils.gputest import empty_cache_and_diag
 from internlm.utils.logger import get_logger, initialize_uniscale_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.model_checkpoint import CheckpointManager
@ -72,7 +73,6 @@ def main(args):
    total_steps = gpc.config.data.total_steps
    valid_every = gpc.config.data.valid_every
    label_smoothing = gpc.config.loss.label_smoothing
-    lr = gpc.config.adam.lr

    get_tflops_func = partial(
        get_megatron_flops,
@ -95,21 +95,11 @@ def main(args):
    # initialize customed llm logger
    uniscale_logger = initialize_llm_logger(start_time=current_time)

-    # initialize and resume train state
-    train_state = TrainState(gpc.config)
-
    # initialize model
    model = initialize_model()

    with open(args.config, "r") as f:
        config_lines = f.readlines()
-    ckpt_manager = CheckpointManager(
-        ckpt_config=gpc.config.ckpt,
-        model=model,
-        model_config=gpc.config.model,
-        model_config_file="".join(config_lines),
-        feishu_address=gpc.config.alert_address,
-    )

    # initialize loss function
    criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
@ -117,15 +107,25 @@ def main(args):
    # initialize the train and validation data loader
    train_dl, dataset_types = get_train_data_loader(num_worker=4)
    val_dls = get_validation_data_loader()
-    train_state.init_batch_sampler(train_dl)

-    # Loading model weights must be done before zero is initialized.
-    ckpt_manager.try_load_model(current_time)
+    # initialize and resume train state
+    train_state = TrainState(gpc.config, train_dl.batch_sampler)

    optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)

+    ckpt_manager = CheckpointManager(
+        ckpt_config=gpc.config.ckpt,
+        model=model,
+        optimizer=optimizer,
+        lr_scheduler=lr_scheduler,
+        train_dl=train_dl,
+        model_config=gpc.config.model,
+        model_config_file="".join(config_lines),
+        feishu_address=gpc.config.monitor.alert.feishu_alert_address,
+    )
+
    # Loading other persistent training states.
-    ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)
+    ckpt_manager.try_resume_training(train_state, current_time)

    # initialize customed llm writer
    writer = Writer(
@ -194,9 +194,7 @@ def main(args):
    with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
        # start iterating the train data and begin training
        for batch_count in range(train_state.batch_count, total_steps):
-            if batch_count % 50 == 0:
-                torch.cuda.empty_cache()
-
+            empty_cache_and_diag(batch_count, interval=gpc.config.data.empty_cache_and_diag_interval)
            start_time = time.time()
            timer("one-batch").start()

@ -238,10 +236,10 @@ def main(args):
                train_state.step_count += 1
            else:
                train_state.inf_nan_skip_batches += 1  # record the amount of updating parameters unsuccessfully.
-                if -1 in grad_norm_groups and gpc.is_rank_for_log():  # -1 encodes a specific failure case
+                if -1 in grad_norm_groups.values() and gpc.is_rank_for_log():  # -1 encodes a specific failure case
                    logger.warning(f"Warning: skip parameter update at step {batch_count}.")
                    send_alert_message(
-                        address=gpc.config.alert_address,
+                        address=gpc.config.monitor.alert.feishu_alert_address,
                        message=f"Warning: skip parameter update at step {batch_count}.",
                    )

@ -302,11 +300,15 @@ if __name__ == "__main__":
    assert hasattr(gpc, "config") and gpc.config is not None

    # initialize monitor manager context
-    with initialize_monitor_manager(job_name=gpc.config.JOB_NAME, alert_address=gpc.config.alert_address):
+    with initialize_monitor_manager(
+        job_name=gpc.config.JOB_NAME, alert_address=gpc.config.monitor.alert.feishu_alert_address
+    ):
        try:
            main(args)
        except Exception:
            logger.error(
                f"Raise exception from {hostname} with rank id: {gpc.get_global_rank()}\n{traceback.format_exc()}",
            )
-            mm.monitor_exception(alert_address=gpc.config.alert_address, excp_info=traceback.format_exc())
+            mm.monitor_exception(
+                alert_address=gpc.config.monitor.alert.feishu_alert_address, excp_info=traceback.format_exc()
+            )
--- a/version.txt
+++ b/version.txt
@ -1 +1 @@
-0.1.0
+0.2.0
 @ -1 +1 @@
 .1.0
 .2.0