diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml index c70e69d..75edf17 100644 --- a/.github/workflows/e2e_test.yaml +++ b/.github/workflows/e2e_test.yaml @@ -7,7 +7,6 @@ on: - "doc/**" - "**.md" env: - WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4) SLURM_PARTITION: llm_s jobs: @@ -15,12 +14,9 @@ jobs: runs-on: [t_cluster] timeout-minutes: 5 steps: - - name: mask env - run: | - echo "::add-mask::${{env.WORKSPACE_PREFIX}}" - uses: actions/checkout@v3 - name: training_8GPU run: | source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 - srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training diff --git a/.github/workflows/weekly_test.yaml b/.github/workflows/weekly_test.yaml new file mode 100644 index 0000000..6251459 --- /dev/null +++ b/.github/workflows/weekly_test.yaml @@ -0,0 +1,101 @@ +name: weekly-tests +on: + push: + branches: + - "main" +env: + SLURM_PARTITION: llm_s + +jobs: + training_8GPU: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: training_8GPU + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training + + training_16GPU_8DP2TP: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: training_16GPU_8DP2TP + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training + + training_16GPU_8DP2TPSP: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: training_16GPU_8DP2TPSP + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py + sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training + + training_16GPU_8DP2PP: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: training_16GPU_8DP2PP + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training + + training_16GPU_8DP2PP_InterleavedOverlap: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: training_16GPU_8DP2PP_InterleavedOverlap + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + sed -i 
's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py + sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training + + unit_test_optimizer: + runs-on: [t_cluster] + timeout-minutes: 30 + steps: + - uses: actions/checkout@v3 + + - name: test_optimizer + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py + + unit_test_model: + runs-on: [t_cluster] + timeout-minutes: 5 + steps: + - uses: actions/checkout@v3 + + - name: test_embedding_accuracy + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py + + - name: test_model_internlm_accuracy + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py + + - name: test_norm_accuracy + run: | + source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0 + srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py diff --git a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po index bd81fa5..e82a9b1 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po @@ -3,12 +3,11 @@ # This file is distributed under the same license as the InternLM package. # FIRST AUTHOR , 2023. # -#, fuzzy msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-13 17:07+0800\n" +"POT-Creation-Date: 2023-09-15 19:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -20,7 +19,7 @@ msgstr "" "Generated-By: Babel 2.12.1\n" #: ../../source/checkpoint.rst:2 -msgid "模型保存" +msgid "模型加载与保存" msgstr "Model Checkpointing" #: ../../source/checkpoint.rst:4 @@ -36,12 +35,86 @@ msgstr "" #: ../../source/checkpoint.rst:6 msgid "InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。" -msgstr "InternLM supports automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit. " +msgstr "InternLM supports automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit." #: ../../source/checkpoint.rst:9 -msgid "Checkpointing" +msgid "CheckpointManager" msgstr "" +#: ../../source/checkpoint.rst:11 +msgid "" +"``CheckpointManager`` " +"是InternLM负责进行模型加载和保存的工具类,其会使用config文件中的ckpt字段的初始化参数字典初始化自身的参数,目前相关的参数有:" +msgstr "" +"CheckpointManager is the utility class within InternLM responsible for " +"model loading and saving. It initializes its own parameters using the " +"initialization parameter dictionary from the 'ckpt' field in the config " +"file. 
Currently, the relevant parameters are as follows:" + +#: ../../source/checkpoint.rst:13 +msgid "``enable_save_ckpt``: 是否开启检查点存储功能(不影响检查点加载)。参数类型 ``bool``,必选参数。" +msgstr "" +"``enable_save_ckpt``: Whether to enable checkpoint storage functionality " +"(does not affect checkpoint loading). Parameter type: ``bool``. This is " +"a required parameter." + +#: ../../source/checkpoint.rst:15 +msgid "``save_ckpt_folder``: 检查点存储路径,参数类型 ``str``,默认为: ``None``,在开启检查点存储功能时为必选参数。" +msgstr "" +"``save_ckpt_folder``: Checkpoint storage path. Parameter type: ``str``. " +"Default: ``None``. This is a required parameter when checkpoint storage " +"is enabled." + +#: ../../source/checkpoint.rst:17 +msgid "``checkpoint_every``: 检查点存储频率,参数类型 ``int``,默认为: ``50``。" +msgstr "" +"``checkpoint_every``: Checkpoint storage frequency. Parameter type: " +"``int``. Default: ``50``." + +#: ../../source/checkpoint.rst:19 +msgid "" +"``load_ckpt_folder``: 初始化检查点/权重加载路径。参数类型 ``str``,默认为: ``None``,详见 :ref" +":`load-ckpt-folder`。" +msgstr "" +"``load_ckpt_folder``: Initial checkpoint/weight loading path. Parameter " +"type: ``str``. Default: ``None``. See :ref:`load-ckpt-folder` for details." + +#: ../../source/checkpoint.rst:21 +msgid "``async_upload``: 是否开启异步上传,默认值为:``False``,详见 :ref:`asyncupload`。" +msgstr "" +"``async_upload``: Whether to enable asynchronous uploading. Default: " +"``False``. See :ref:`asyncupload` for details." + +#: ../../source/checkpoint.rst:23 +msgid "``async_upload_tmp_folder``: 异步上传临时存储路径。" +msgstr "" +"``async_upload_tmp_folder``: Temporary storage path for asynchronous " +"uploading." + +#: ../../source/checkpoint.rst:25 +msgid "" +"``oss_snapshot_freq``: 快照存储频率,默认值为:``checkpoint_every``的一半。详见 " +":ref:`snapshot`。" +msgstr "" +"``oss_snapshot_freq``: Snapshot storage frequency. Default: half of " +"``checkpoint_every``. See :ref:`snapshot` for details." + +#: ../../source/checkpoint.rst:27 +msgid "``auto_resume``: 是否开启检查点自动恢复,默认值为:``True``,详见 :ref:`autoresume`。" +msgstr "" +"``auto_resume``: Whether to enable automatic checkpoint resume. Default: " +"``True``. See :ref:`autoresume` for details." + +#: ../../source/checkpoint.rst:29 +msgid "``stop_file_path`` : 检查点存储控制文件的路径,默认值为:``None``,详见 :ref:`stopfile`。" +msgstr "" +"``stop_file_path``: Path to the checkpoint storage control file. " +"Default: ``None``. See :ref:`stopfile` for details." + +#: ../../source/checkpoint.rst:32 +msgid "下面给出config文件的参数设置例子:" +msgstr "Here is an example of the parameter settings in the config file:" + #: internlm.utils.model_checkpoint.CheckpointManager:1 of msgid "StorageManagerContext" msgstr "" @@ -86,21 +159,253 @@ msgstr "" msgid "Save checkpoint to the given folder path." msgstr "" -#~ msgid "Attempt to restore the training state of the last ckpt." -#~ msgstr "" +#: ../../source/checkpoint.rst:53 +msgid "加载与存储格式约定" +msgstr "Loading and saving format conventions" -#~ msgid "lr_scheduler object." -#~ msgstr "" +#: ../../source/checkpoint.rst:58 +msgid "(1) 路径格式约定" +msgstr "(1) Path format conventions." -#~ msgid "optimizer object." -#~ msgstr "" +#: ../../source/checkpoint.rst:60 +msgid "InternLM对config中出现的所有存储路径都遵循以下的路径格式约定:" +msgstr "" +"InternLM follows the path format conventions below for all storage paths " +"that appear in the config:" -#~ msgid "learning rate." -#~ msgstr "" +#: ../../source/checkpoint.rst:66 +msgid "对于不同backend的路径,有以下的规则需要注意:" +msgstr "For paths of different backends, the following rules should be noted:" -#~ msgid "traing states."
-#~ msgstr "" +#: ../../source/checkpoint.rst:68 +msgid "" +"如果需要使用boto3的路径,需要在运行前提前导入 ``S3_ACCESS_KEY_ID`` 和 " +"``S3_SECRET_ACCESS_KEY_ID`` 这两个环境变量。" +msgstr "" +"If you need to use paths with Boto3, make sure to import the " +"``S3_ACCESS_KEY_ID`` and ``S3_SECRET_ACCESS_KEY_ID`` environment " +"variables before running." -#~ msgid "traning dataloader object" -#~ msgstr "" +#: ../../source/checkpoint.rst:70 +msgid "bucket的endpoint一般分为Inside IP和Outside IP,如果可以尽量使用inside IP,会获得更佳的存储速度。" +msgstr "" +"The bucket's endpoint is typically divided into Inside IP and Outside IP." +" Whenever possible, it's advisable to use the Inside IP to achieve better" +" storage speed." +#: ../../source/checkpoint.rst:75 +msgid "(2) 模型加载(load_ckpt_folder)格式约定" +msgstr "(2) Model loading format conventions (load_ckpt_folder)." + +#: ../../source/checkpoint.rst:77 +msgid "load_ckpt_folder 由三个字段组成, ``path`` 、 ``content`` 和 ``ckpt_type`` 。" +msgstr "" +"``load_ckpt_folder`` consists of three fields: ``path``, ``content``, and" +" ``ckpt_type``." + +#: ../../source/checkpoint.rst:79 +msgid "``path``:给出了检查点/初始化模型权重的加载路径(path的格式见下小节)" +msgstr "" +"``path``: Specifies the loading path for the checkpoint/initial model " +"weights (the format of the path is described in the following " +"subsection)." + +#: ../../source/checkpoint.rst:81 +msgid "``content``: 表示需要加载的内容,目前支持的字段包括:" +msgstr "" +"``content``: Indicates the content to be loaded, currently supported " +"fields include:" + +#: ../../source/checkpoint.rst:83 +msgid "``model``:加载模型权重。" +msgstr "``model``: Load model weights." + +#: ../../source/checkpoint.rst:84 +msgid "``sampler``:加载sampler状态。" +msgstr "``sampler``: Load sampler state." + +#: ../../source/checkpoint.rst:85 +msgid "``scheduler``:加载lr_scheduler状态。" +msgstr "``scheduler``: Load lr_scheduler state." + +#: ../../source/checkpoint.rst:86 +msgid "``optimzier``:加载optimizer状态。" +msgstr "``optimizer``: Load optimizer state." + +#: ../../source/checkpoint.rst:87 +msgid "``all``:表示所有状态均加载,一般在resume训练使用。" +msgstr "" +"``all``: Indicates that all states should be loaded, typically used for " +"resuming training." + +#: ../../source/checkpoint.rst:89 +msgid "``ckpt_type``:表示加载的模型权重类型,目前支持的字段包括:" +msgstr "" +"``ckpt_type``: Represents the type of model weight to be loaded, " +"currently supported fields include:" + +#: ../../source/checkpoint.rst:91 +msgid "``internlm``:internlm约定的checkpoint存储格式。" +msgstr "``internlm``: Checkpoint storage format as per InternLM conventions." + +#: ../../source/checkpoint.rst:93 +msgid "下面给出两个例子:" +msgstr "Here are two examples:" + +#: ../../source/checkpoint.rst:107 +msgid "异步上传" +msgstr "Asynchronous upload." + +#: ../../source/checkpoint.rst:109 +msgid "" +"异步上传会先同步的将模型存储到 ``async_upload_tmp_folder`` " +"中,再异步的写入远端存储(OSS/NFS)中。从而避免存储ckpt阻塞训练过长时间。" +msgstr "" +"Asynchronous upload first synchronously stores the model in the " +"``async_upload_tmp_folder`` and then asynchronously writes it to remote " +"storage (OSS/NFS). This helps prevent blocking training for extended " +"periods while storing checkpoints." + +#: ../../source/checkpoint.rst:111 ../../source/checkpoint.rst:129 +#: ../../source/checkpoint.rst:145 ../../source/checkpoint.rst:160 +msgid "config.ckpt 中相关的参数:" +msgstr "The parameters related to ``config.ckpt`` are:" + +#: ../../source/checkpoint.rst:113 +msgid "``async_upload``: 是否开启异步上传。参数类型 ``bool/None``,默认为 ``False``。" +msgstr "" +"``async_upload``: Whether to enable asynchronous upload. Parameter type: " +"``bool/None``. Default is ``False``." 
+ +#: ../../source/checkpoint.rst:115 +msgid "" +"``async_upload_tmp_folder``: 异步上传临时存储路径。参数类型 ``str/None``, 默认值为 " +"``/dev/shm/{JOB_NAME}_tmp_ckpt/``。" +msgstr "" +"``async_upload_tmp_folder``: Temporary storage path for asynchronous " +"upload. Parameter type: ``str/None``. Default value is " +"``/dev/shm/{JOB_NAME}_tmp_ckpt/``." + +#: ../../source/checkpoint.rst:117 +msgid "需要注意的是,异步上传功能仅在backend为boto3时才会有效果,bcakend为local时只支持同步存储。" +msgstr "" +"It's important to note that asynchronous upload is only effective when " +"the backend is set to \"boto3\". When the backend is set to \"local\", " +"only synchronous storage is supported." + +#: ../../source/checkpoint.rst:119 +msgid "" +"``async_upload_tmp_folder`` " +"设置的的原则为尽量设置为计算节点的local目录,这样才可以获得最佳的异步上传速度,一般来说建议为 ``/dev/shm`` 或 " +"``/nvme`` 下的路径,如果使用同步上传,则该路径可不给。" +msgstr "" +"``async_upload_tmp_folder`` should preferably be set to a local directory " +"of the compute node, so as to obtain the best asynchronous upload speed. " +"Generally, a path under ``/dev/shm`` or ``/nvme`` is recommended. If you " +"use synchronous upload, this path can be omitted." + +#: ../../source/checkpoint.rst:125 +msgid "快照检查点" +msgstr "Snapshot Checkpoint" + +#: ../../source/checkpoint.rst:127 +msgid "" +"快照检查点是一种特殊的检查点,其是为了减少模型因为训练崩溃(ECC error, NCCL error, " +".etc)等问题导致训练任务崩溃而损失的训练进度。其采用交替覆盖写的策略,所占用的存储大小为两个step的检查点所需的空间。配合上异步的检查点写入,在不影响训练速度和存储容量的条件下极大的增大了检查点的存储频率。" +msgstr "" +"A snapshot checkpoint is a special checkpoint intended to reduce the " +"training progress lost when a training task crashes due to problems such " +"as ECC errors or NCCL errors. It adopts an alternating overwrite " +"strategy, so the storage it occupies is the space required for the " +"checkpoints of two steps. Combined with asynchronous checkpoint writing, " +"it greatly increases the checkpoint storage frequency without affecting " +"training speed or storage capacity." + +#: ../../source/checkpoint.rst:131 +msgid "``oss_snapshot_freq``: 快照存储频率。参数类型 ``int/None``,默认为 ``50``。" +msgstr "" +"``oss_snapshot_freq``: Snapshot storage frequency. Parameter type " +"``int/None``, default is ``50``." + +#: ../../source/checkpoint.rst:133 +msgid "" +"``oss_snapshot_freq`` 可以根据模型每step时间酌情设置,一般快照频率在1小时以下,半小时以上为怡/不给(默认值是 " +"``checkpoint_every`` 的二分之一)。" +msgstr "" +"``oss_snapshot_freq`` can be set according to the per-step time of the " +"model. In general, a snapshot interval between half an hour and one hour " +"is appropriate, or it can simply be left unset (the default value is " +"one-half of ``checkpoint_every``)." + +#: ../../source/checkpoint.rst:139 +msgid "检查点自动恢复" +msgstr "Automatic checkpoint recovery" + +#: ../../source/checkpoint.rst:141 +msgid "" +"检查点自动加载功能的目的是在resume训练时,自动加载 ``save_ckpt_folder`` " +"路径下最新的检查点(包括snapshot检查点)。配合上自动重启机制,可以实现无人干预的任务自动恢复。" +msgstr "" +"The purpose of automatic checkpoint recovery is to automatically load the " +"latest checkpoint (including snapshot checkpoints) under the " +"``save_ckpt_folder`` path when resuming training. Combined with an " +"automatic restart mechanism, tasks can be recovered without human " +"intervention."
+ +#: ../../source/checkpoint.rst:143 +msgid "" +"该功能默认开启,所以要注意如果需要加载 ``load_ckpt_folder`` 路径下的模型权重,要将 ``auto_resume`` 设置为 " +"False,否则可能会产生预期外的行为。" +msgstr "" +"This feature is enabled by default, so note that if you need to load the " +"model weights under the ``load_ckpt_folder`` path, you must set " +"``auto_resume`` to ``False``; otherwise, unexpected behavior may occur." + +#: ../../source/checkpoint.rst:147 +msgid "``auto_resume``: 是否开启检查点自动恢复。参数类型 ``bool``,默认为 ``True``。" +msgstr "" +"``auto_resume``: Whether to enable automatic checkpoint recovery. " +"Parameter type ``bool``, default is ``True``." + +#: ../../source/checkpoint.rst:149 +msgid "" +"``auto_resume`` 如果为True,则尝试从 ``save_ckpt_folder`` " +"路径中自动加载最新的ckpt,如果找不到,则从step 0开始训练。如果为False,则尝试从 ``load_ckpt_folder`` " +"中加载模型参数。" +msgstr "" +"If ``auto_resume`` is True, InternLM attempts to automatically load the " +"latest ckpt from the ``save_ckpt_folder`` path; if none is found, " +"training starts from step 0. If False, it tries to load model parameters " +"from ``load_ckpt_folder``." + +#: ../../source/checkpoint.rst:155 +msgid "手动控制检查点存储" +msgstr "Manual control of checkpoint storage" + +#: ../../source/checkpoint.rst:157 +msgid "" +"在模型距离下一次检查点存储还有很长时间,这时如果希望立刻停止一个任务,又不希望丢失目前训练进度时可以使用手动控制检查点存储功能。通过向一个位于NFS上的" +" ``stop_file_path`` 文件中写入希望任务停止的step步数,Global Rank " +"0的进程会在每个step轮询该文件的值,如果发现有我们给出的停止step,则会进行一次广播通知所有的训练进程,约定各进程在训练到该step时存储一个检查点,并选择是否退出。" +msgstr "" +"When the model is still a long time away from the next checkpoint save, " +"and you want to stop a task immediately without losing the current " +"training progress, you can use the manual checkpoint storage control " +"feature. By writing the step at which you want the task to stop into a " +"``stop_file_path`` file located on NFS, the Global Rank 0 process polls " +"the value of this file at every step. If it finds the stop step we " +"specified, it broadcasts a notification to all training processes, which " +"agree to store a checkpoint when training reaches that step and then " +"choose whether to exit."
+ +#: ../../source/checkpoint.rst:162 +msgid "``stop_file_path``:检查点存储控制文件的路径,参数类型 ``str/None``,默认为 ``None``,表示关闭该功能。" +msgstr "" +"``stop_file_path``: The path of the checkpoint storage control file, " +"parameter type ``str/None``, the default is ``None``, indicating to turn " +"off this function" + +#: ../../source/checkpoint.rst:164 +msgid "下面给出一个写入 ``stop_file_path`` 的例子:" +msgstr "An example of writing to ``stop_file_path`` is given below:" diff --git a/doc/code-docs/source/checkpoint.rst b/doc/code-docs/source/checkpoint.rst index ee4f037..cd9b755 100644 --- a/doc/code-docs/source/checkpoint.rst +++ b/doc/code-docs/source/checkpoint.rst @@ -1,12 +1,172 @@ -模型保存 +模型加载与保存 =================== InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。其中,可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。 InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。 -Checkpointing -------------- +CheckpointManager +-------------------------- + +``CheckpointManager`` 是InternLM负责进行模型加载和保存的工具类,其会使用config文件中的ckpt字段的初始化参数字典初始化自身的参数,目前相关的参数有: + +- ``enable_save_ckpt``: 是否开启检查点存储功能(不影响检查点加载)。参数类型 ``bool``,必选参数。 + +- ``save_ckpt_folder``: 检查点存储路径,参数类型 ``str``,默认为: ``None``,在开启检查点存储功能时为必选参数。 + +- ``checkpoint_every``: 检查点存储频率,参数类型 ``int``,默认为: ``50``。 + +- ``load_ckpt_folder``: 初始化检查点/权重加载路径。参数类型 ``str``,默认为: ``None``,详见 :ref:`load-ckpt-folder`。 + +- ``async_upload``: 是否开启异步上传,默认值为:``False``,详见 :ref:`asyncupload`。 + +- ``async_upload_tmp_folder``: 异步上传临时存储路径。 + +- ``oss_snapshot_freq``: 快照存储频率,默认值为:``checkpoint_every``的一半。详见 :ref:`snapshot`。 + +- ``auto_resume``: 是否开启检查点自动恢复,默认值为:``True``,详见 :ref:`autoresume`。 + +- ``stop_file_path`` : 检查点存储控制文件的路径,默认值为:``None``,详见 :ref:`stopfile`。 + + +下面给出config文件的参数设置例子: + +.. code-block:: python + + ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + load_ckpt_folder=dict(path="local:/mnt/mfs/ckpt", content=["all",], ckpt_type="internlm"), + auto_resume=False, # disable auto-resume, internlm will load model checkpoint from the path of 'load_ckpt_folder'. + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. + ) + .. autoclass:: internlm.utils.model_checkpoint.CheckpointManager :members: + + +加载与存储格式约定 +-------------------------- + +.. _load-ckpt-folder: + +(1) 路径格式约定 +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +InternLM对config中出现的所有存储路径都遵循以下的路径格式约定: + +.. figure:: ../../imgs/ckpt_path_format_CN.png + :scale: 30% + :class: with-border + +对于不同backend的路径,有以下的规则需要注意: + +1. 如果需要使用boto3的路径,需要在运行前提前导入 ``S3_ACCESS_KEY_ID`` 和 ``S3_SECRET_ACCESS_KEY_ID`` 这两个环境变量。 + +2. bucket的endpoint一般分为Inside IP和Outside IP,如果可以尽量使用inside IP,会获得更佳的存储速度。 + + + +(2) 模型加载(load_ckpt_folder)格式约定 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +load_ckpt_folder 由三个字段组成, ``path`` 、 ``content`` 和 ``ckpt_type`` 。 + +- ``path``:给出了检查点/初始化模型权重的加载路径(path的格式见下小节) + +- ``content``: 表示需要加载的内容,目前支持的字段包括: + + - ``model``:加载模型权重。 + - ``sampler``:加载sampler状态。 + - ``scheduler``:加载lr_scheduler状态。 + - ``optimzier``:加载optimizer状态。 + - ``all``:表示所有状态均加载,一般在resume训练使用。 + +- ``ckpt_type``:表示加载的模型权重类型,目前支持的字段包括: + + - ``internlm``:internlm约定的checkpoint存储格式。 + +下面给出两个例子: + +.. 
code-block:: python + + # 从文件存储相对路径 ckpt_model 中加载已有模型权重初始化模型,适合 sft 等训练初始化 + load_ckpt_folder= dict(path="local:ckpt_model", content=["model",], ckpt_type="internlm") + + # 从文件存储相对路径 ckpt_model 中加载所有的状态,适合断点续训的场景 + load_ckpt_folder= dict(path="local:ckpt_model", content=["all",], ckpt_type="internlm") + + +.. _asyncupload: + +异步上传 +-------------------------- + +异步上传会先同步的将模型存储到 ``async_upload_tmp_folder`` 中,再异步的写入远端存储(OSS/NFS)中。从而避免存储ckpt阻塞训练过长时间。 + +config.ckpt 中相关的参数: + +- ``async_upload``: 是否开启异步上传。参数类型 ``bool/None``,默认为 ``False``。 + +- ``async_upload_tmp_folder``: 异步上传临时存储路径。参数类型 ``str/None``, 默认值为 ``/dev/shm/{JOB_NAME}_tmp_ckpt/``。 + +需要注意的是,异步上传功能仅在backend为boto3时才会有效果,bcakend为local时只支持同步存储。 + +``async_upload_tmp_folder`` 设置的的原则为尽量设置为计算节点的local目录,这样才可以获得最佳的异步上传速度,一般来说建议为 ``/dev/shm`` 或 ``/nvme`` 下的路径,如果使用同步上传,则该路径可不给。 + + +.. _snapshot: + +快照检查点 +-------------------------- + +快照检查点是一种特殊的检查点,其是为了减少模型因为训练崩溃(ECC error, NCCL error, .etc)等问题导致训练任务崩溃而损失的训练进度。其采用交替覆盖写的策略,所占用的存储大小为两个step的检查点所需的空间。配合上异步的检查点写入,在不影响训练速度和存储容量的条件下极大的增大了检查点的存储频率。 + +config.ckpt 中相关的参数: + +- ``oss_snapshot_freq``: 快照存储频率。参数类型 ``int/None``,默认为 ``50``。 + +``oss_snapshot_freq`` 可以根据模型每step时间酌情设置,一般快照频率在1小时以下,半小时以上为怡/不给(默认值是 ``checkpoint_every`` 的二分之一)。 + + +.. _autoresume: + +检查点自动恢复 +-------------------------- + +检查点自动加载功能的目的是在resume训练时,自动加载 ``save_ckpt_folder`` 路径下最新的检查点(包括snapshot检查点)。配合上自动重启机制,可以实现无人干预的任务自动恢复。 + +该功能默认开启,所以要注意如果需要加载 ``load_ckpt_folder`` 路径下的模型权重,要将 ``auto_resume`` 设置为 False,否则可能会产生预期外的行为。 + +config.ckpt 中相关的参数: + +- ``auto_resume``: 是否开启检查点自动恢复。参数类型 ``bool``,默认为 ``True``。 + +``auto_resume`` 如果为True,则尝试从 ``save_ckpt_folder`` 路径中自动加载最新的ckpt,如果找不到,则从step 0开始训练。如果为False,则尝试从 ``load_ckpt_folder`` 中加载模型参数。 + + +.. _stopfile: + +手动控制检查点存储 +-------------------------- + +在模型距离下一次检查点存储还有很长时间,这时如果希望立刻停止一个任务,又不希望丢失目前训练进度时可以使用手动控制检查点存储功能。通过向一个位于NFS上的 ``stop_file_path`` 文件中写入希望任务停止的step步数,Global Rank 0的进程会在每个step轮询该文件的值,如果发现有我们给出的停止step,则会进行一次广播通知所有的训练进程,约定各进程在训练到该step时存储一个检查点,并选择是否退出。 + + +config.ckpt 中相关的参数: + +- ``stop_file_path``:检查点存储控制文件的路径,参数类型 ``str/None``,默认为 ``None``,表示关闭该功能。 + +下面给出一个写入 ``stop_file_path`` 的例子: + +.. code-block:: bash + + # 我们希望停止的step步数 + # 如果存入的step>0,则任务会在存储ckpt后自动退出 + # 如果存入的step<0,则任务会在存储ckpt后会继续训练 + echo "999" > ./llm_alter/1006_pr.log + diff --git a/doc/imgs/ckpt_path_format_CN.png b/doc/imgs/ckpt_path_format_CN.png new file mode 100644 index 0000000..0307d22 Binary files /dev/null and b/doc/imgs/ckpt_path_format_CN.png differ diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index 1463cc1..2a617f4 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -2,6 +2,7 @@ # -*- encoding: utf-8 -*- import argparse +import gc import os from pathlib import Path from typing import Dict, Union @@ -448,6 +449,8 @@ def initialize_distributed_env( master_port (str): The master port for distributed training. 8888 by default. seed (int, optional): Specified random seed for every process. 1024 by default. 
""" + # close automatic garbage collection + gc.disable() torch.cuda.empty_cache() diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index ddb4932..48877b9 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import gc import math import socket @@ -41,6 +42,8 @@ def empty_cache_and_diag(batch_count, interval=50): bench_net() # do empty_cache after the bench torch.cuda.empty_cache() + # do garbage collection + gc.collect() def benchmark_forward( diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py index 6c9d828..2f52500 100644 --- a/tests/test_training/test_loss.py +++ b/tests/test_training/test_loss.py @@ -1,4 +1,5 @@ import math +import os import subprocess import pytest @@ -24,7 +25,7 @@ from internlm.utils.gputest import empty_cache_and_diag from internlm.utils.megatron_timers import megatron_timer as timer from internlm.utils.model_checkpoint import CheckpointManager -CONFIG_FILE_PATH = "./configs/7B_sft.py" +CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH", "./configs/7B_sft.py") TOTAL_STEPS = 10 LOSS_SPIKE_LIMIT = 1.5 LOSS_DEVIATION_LIMIT = 0.2 @@ -43,11 +44,40 @@ BASELINE_LOSS_LIST = [ cur_loss_list = [] -def train(): +def train( + dp_size: int = 1, + tp_size: int = 1, + pp_size: int = 1, + num_chunks: int = 2, + interleaved: bool = False, + enable_sp: bool = False, +): # initialize distributed environment initialize_distributed_env(config=CONFIG_FILE_PATH) assert hasattr(gpc, "config") and gpc.config is not None + # check parallel config + assert ( + gpc.get_world_size(ParallelMode.DATA) == dp_size + ), f"data parallel size: {gpc.get_world_size(ParallelMode.DATA)} is not as expected {dp_size}" + assert ( + gpc.get_world_size(ParallelMode.TENSOR) == tp_size + ), f"tensor parallel size: {gpc.get_world_size(ParallelMode.TENSOR)} is not as expected {tp_size}" + assert ( + gpc.get_world_size(ParallelMode.PIPELINE) == pp_size + ), f"pipeline parallel size: {gpc.get_world_size(ParallelMode.PIPELINE)} is not as expected {pp_size}" + if interleaved: + assert ( + gpc.is_using_pp() and hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks == num_chunks + ) + assert gpc.config.parallel["pipeline"].get( + "interleaved_overlap", False + ), "interleaved overlap must be enabled when using interleave pipeline scheduler" + if enable_sp: + assert gpc.config.parallel.get( + "sequence_parallel", False + ), "sequence_parallel must be True when enable_sp is True" + # init setting gpc.config.data.total_steps = TOTAL_STEPS gpc.config.lr_scheduler.total_steps = TOTAL_STEPS @@ -193,198 +223,61 @@ def check_loss_accuracy(): ), f"The loss accuracy is abnormal, {target}->{cur}, please check it!" -class TestCaseTrain8GPU: - """ - Test cases for Model Training with 8 GPUs. - Parallel Config: - data parallel size = 8. 
- """ +@pytest.mark.training_8GPU +def test_training_loss_with_dp8(): + # model training + train(dp_size=8) - @staticmethod - def setup_class(): - # model training - train() + # print loss value + print(f"cur_loss_list: {cur_loss_list}", flush=True) - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_8GPU - def test_loss_spike_with_dp8(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_8GPU - def test_loss_accuracy_with_dp8(): - check_loss_accuracy() + check_loss_spike() + check_loss_accuracy() -class TestCaseTrain16GPUWith8DP2TP: - """ - Test cases for Model Training with 16 GPUs. - Parallel Config: - data parallel size = 8. - tensor parallel size = 2. - """ +@pytest.mark.training_16GPU_8DP2TP +def test_training_loss_with_dp8_tp2(): + # model training + train(dp_size=8, tp_size=2) - @staticmethod - def setup_class(): - # update config tensor parallel size - command = f"sed -i 's/^.*tensor=.*/ tensor=2,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) + # print loss value + print(f"cur_loss_list: {cur_loss_list}", flush=True) - # model training - train() - - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_16GPU_8DP2TP - def test_loss_spike_with_dp8_tp2(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_16GPU_8DP2TP - def test_loss_accuracy_with_dp8_tp2(): - check_loss_accuracy() + check_loss_spike() + check_loss_accuracy() -class TestCaseTrain16GPUWith8DP2TPSP: - """ - Test cases for Model Training with 16 GPUs. - Parallel Config: - data parallel size = 8. - tensor parallel size = 2. - sequence parallel = True. - """ +@pytest.mark.training_16GPU_8DP2TPSP +def test_training_loss_with_dp8_tp2_sp(): + # model training + train(dp_size=8, tp_size=2, enable_sp=True) - @staticmethod - def setup_class(): - # update config tensor parallel size and sequence parallel - command = f"sed -i 's/^.*tensor=.*/ tensor=2,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) + # print loss value + print(f"cur_loss_list: {cur_loss_list}", flush=True) - # model training - train() - - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_16GPU_8DP2TPSP - def test_loss_spike_with_dp8_tp2_sp(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_16GPU_8DP2TPSP - def test_loss_accuracy_with_dp8_tp2_sp(): - check_loss_accuracy() + check_loss_spike() + check_loss_accuracy() -class TestCaseTrain16GPUWith8DP2PP: - """ - Test cases for Model Training with 16 GPUs. - Parallel Config: - data parallel size = 8. - pipeline parallel size = 2. 
- """ +@pytest.mark.training_16GPU_8DP2PP +def test_training_loss_with_dp8_pp2(): + # model training + train(dp_size=8, pp_size=2) - @staticmethod - def setup_class(): - # update config pipeline parallel size - command = f"sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*tensor=.*/ tensor=1,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) + # print loss value + print(f"cur_loss_list: {cur_loss_list}", flush=True) - # model training - train() - - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP - def test_loss_spike_with_dp8_pp2(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP - def test_loss_accuracy_with_dp8_pp2(): - check_loss_accuracy() + check_loss_spike() + check_loss_accuracy() -class TestCaseTrain16GPUWith8DP2PPInterleaved: - """ - Test cases for Model Training with 16 GPUs. - Parallel Config: - data parallel size = 8. - pipeline parallel size = 2. - interleaved scheduler = True. - """ +@pytest.mark.training_16GPU_8DP2PP_InterleavedOverlap +def test_training_loss_with_dp8_pp2_interleaved_overlap(): + # model training + train(dp_size=8, pp_size=2, interleaved=True) - @staticmethod - def setup_class(): - # update config pipeline parallel size - command = f"sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*tensor=.*/ tensor=1,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=False) + # print loss value + print(f"cur_loss_list: {cur_loss_list}", flush=True) - # model training - train() - - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP_Interleaved - def test_loss_spike_with_dp8_pp2_interleaved(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP_Interleaved - def test_loss_accuracy_with_dp8_pp2_interleaved(): - check_loss_accuracy() - - -class TestCaseTrain16GPUWith8DP2PPInterleavedOverlap: - """ - Test cases for Model Training with 16 GPUs. - Parallel Config: - data parallel size = 8. - pipeline parallel size = 2. - interleaved scheduler = True. - interleaved overlap = True. - """ - - @staticmethod - def setup_class(): - # update config pipeline parallel size - command = f"sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - command = f"sed -i 's/^.*tensor=.*/ tensor=1,/' {CONFIG_FILE_PATH}" - subprocess.run(command, shell=True, check=True) - - # model training - train() - - # print loss value - print(f"cur_loss_list: {cur_loss_list}", flush=True) - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP_InterleavedOverlap - def test_loss_spike_with_dp8_pp2_interleaved_overlap(): - check_loss_spike() - - @staticmethod - @pytest.mark.training_16GPU_8DP2PP_InterleavedOverlap - def test_loss_accuracy_with_dp8_pp2_interleaved_overlap(): - check_loss_accuracy() + check_loss_spike() + check_loss_accuracy()