Merge remote-tracking branch 'origin/main' into develop

pull/298/head
yingtongxiong 2023-09-08 10:19:54 +08:00
commit 0c276d8de2
43 changed files with 2794 additions and 149 deletions

View File

@@ -0,0 +1,123 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/checkpoint.rst:2 09c8645fba264cdf9a80c4b62c2bb4d1
msgid "模型保存"
msgstr "Model Checkpointing"
#: ../../source/checkpoint.rst:4 8b158d34631045b1afdb4fb0169b3c71
msgid ""
"InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` "
"来管理模型保存。 其中,可以 使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
"来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份并在接收信号退出训练时自动进行模型备份。"
msgstr ""
"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation, "
"we use ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports "
"automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit."
#: ../../source/checkpoint.rst:8 a023b5a6d15749bfaa51cf2da194bda1
msgid "Checkpointing"
msgstr ""
#: 938575c699d1426c87e0b3f589a85d50
#: internlm.utils.model_checkpoint.CheckpointManager:1 of
msgid "StorageManagerContext"
msgstr ""
#: 754d6881cd034c5ebaab0f3362dd14c2
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of
msgid ""
"Exit signal detection function, if we write the exit step in the "
"'QUIT_FILE_PATH' file, all ranks will save ckpt and exit. Negative "
"integer step means save ckpt. Positive integer step means save ckpt and "
"quit."
msgstr ""
#: 2169f9fb4a8b40bc9bf6093894fc7a5e 6a55d2b2b24a44c8b78b40f19f4d950b
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training of
msgid "参数"
msgstr ""
#: 360a89b1591e4627ac432f4d75050354
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回"
msgstr ""
#: 2426832f4a8a4c5481be1c940e0e7b50
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of
msgid "whether to quit."
msgstr ""
#: 5f6842c261544a3c89f32d981b3ad755
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回类型"
msgstr ""
#: 1392da84b6e645bcb8dab605e1231fdc
#: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1
#: of
msgid "wait for all checkpoint uploads to be completed"
msgstr ""
#: d1774593e9c94608b49b10504bfbc38b
#: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1
#: of
msgid ""
"Returns: Tuple(str, int): path of latest ckpt and ckpt step, if not "
"found, None will return."
msgstr ""
#: a3abbbd2bd574872892d908ab248e804
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:1 of
msgid "Attempt to restore the training state of the last ckpt."
msgstr ""
#: de021d1eb6d54955a2850c11c0191710
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:3 of
msgid "lr_scheduler object."
msgstr ""
#: 20be15854f2e420a9d96c86b5869bfa6
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:5 of
msgid "optimizer object."
msgstr ""
#: 68f69086c5054acc8aca15c8a764acc5
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:7 of
msgid "learning rate."
msgstr ""
#: 5d34d34a972d4abeab4bda3e49ee157b
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:9 of
msgid "traing states."
msgstr ""
#: 82ebb67afaa748ecabc4cef598d7fc30
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:11 of
msgid "traning dataloader object"
msgstr ""
#: 0c95dfcd712749279daca78166bb4326
#: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of
msgid "Save checkpoint to the given folder path."
msgstr ""

View File

@@ -0,0 +1,50 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/example/30B_demo.rst:2 242d1f89ae2045f1bf1f31bf82f07846
msgid "30B Demo"
msgstr ""
#: ../../source/example/30B_demo.rst:5 c2415bfa6978414a939dcc395fdfb544
msgid "训练配置"
msgstr "Training Config"
#: ../../source/example/30B_demo.rst:7 75f568d1ca5546228f88958c12c2dd65
msgid "30B demo 训练配置文件样例如下:"
msgstr "30B demo config file example:"
#: ../../source/example/30B_demo.rst:164 533cb04f94314eeb8381e45f06d03108
msgid "启动训练"
msgstr "Start Training"
#: ../../source/example/30B_demo.rst:166 24974384d5ab42e68266aeb67ae222ce
msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动两节点 16GPU 的训练命令如下所示:"
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs."
#: ../../source/example/30B_demo.rst:173 948ac71ed53848f9bad07f69d956c4bb
msgid "训练结果"
msgstr "Training Results"
#: ../../source/example/30B_demo.rst:175 615a3481b0aa49729b7219b1365519aa
msgid "基于以上训练配置和启动命令,两节点 16GPU 下的模型训练部分日志展示如下:"
msgstr "Taking the configuration of the demo training on two nodes with 16 GPUs on slurm as an example, the training result log is shown below:"

View File

@@ -0,0 +1,50 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/example/7B_demo.rst:2 8576f969040249bb93e7c347ef210990
msgid "7B Demo"
msgstr ""
#: ../../source/example/7B_demo.rst:5 5429ceea12424825991744bece744f60
msgid "训练配置"
msgstr "Training Config"
#: ../../source/example/7B_demo.rst:7 c9a47faf5deb40b68ad2bc950fdf2b14
msgid "7B demo 的训练配置文件样例如下:"
msgstr "7B demo config file example:"
#: ../../source/example/7B_demo.rst:162 eb93a6ca05c8421eb87a2470f9f31fc2
msgid "启动训练"
msgstr "Start Training"
#: ../../source/example/7B_demo.rst:164 9e7a864ae2e14d05b0681f16792e5278
msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动单节点 8GPU 的训练命令如下所示:"
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs."
#: ../../source/example/7B_demo.rst:171 fdd053efb1854d46aabf6c0f279fe7fc
msgid "训练结果"
msgstr "Training Results"
#: ../../source/example/7B_demo.rst:173 33ec81f34e3c4340beacdb5254069d08
msgid "基于以上训练配置和启动命令,单节点 8GPU 下的模型训练部分日志展示如下:"
msgstr "Taking the configuration of the demo training on a single machine with 8 GPUs on slurm as an example, the training result log is shown below:"

View File

@@ -0,0 +1,33 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/example/index.rst:2 de54695e8bde40ffb8878043072197e6
msgid "训练样例"
msgstr "Training Example"
#: ../../source/example/index.rst:5 da388b3209ff4bd39fd0700a7fba413a
msgid "7B Demo"
msgstr ""
#: ../../source/example/index.rst:13 b095e27dfc924a7a943b7cba5361700a
msgid "30B Demo"
msgstr ""

View File

@@ -0,0 +1,81 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/index.rst:8 11e029810acf410180311a3c63eb01f4
msgid "InternLM"
msgstr "InternLM"
#: ../../source/index.rst:11 e6fd7d058e4b43bb81157ac79867e3d3
msgid "环境构建"
msgstr "Environment Setup"
#: ../../source/index.rst:19 f323ede90c0f434d8b627eded1d8fc10
msgid "快速上手"
msgstr "Quickstart Guide"
#: ../../source/index.rst:27 3c504b4b92264e9182abb0fa81fe80c3
msgid "训练构建"
msgstr "Model Setup"
#: ../../source/index.rst:35 5cc5c831399a40b089d27b777a776b16
msgid "训练 API"
msgstr "Training API"
#: ../../source/index.rst:43 21a7473eabb441f8bfe28d2a0e306889
msgid "并行训练"
msgstr "Parallel Training"
#: ../../source/index.rst:51 9234725f3c464731993d73607608c874
msgid "模型备份"
msgstr "Model Checkpointing"
#: ../../source/index.rst:59 8e4ce037017f4510b2892a66003877fa
msgid "性能分析"
msgstr "Profiler"
#: ../../source/index.rst:67 a36e02819ecd4b448a8cb4ebbecb6600
msgid "训练监控"
msgstr "Monitor"
#: ../../source/index.rst:75 b912e292486f455c8b5cdd75962e8ac2
msgid "训练样例"
msgstr "Example"
#: ../../source/index.rst:83 ea9e9281720941a1830e5df7a2badf7a
msgid "常见问题"
msgstr "Q&A"
#: ../../source/index.rst:91 e08edc5aa1c74965b10084b393b88fae
msgid "索引和表格"
msgstr "Indices and tables"
#: ../../source/index.rst:93 f3fdca059caa49dcad09aa44be7f02d6
msgid ":ref:`genindex`"
msgstr ""
#: ../../source/index.rst:94 b3791e811315435097bb507edc3f4b9b
msgid ":ref:`modindex`"
msgstr ""
#: ../../source/index.rst:95 a164b772960f4ab8b18c7e8820f69f55
msgid ":ref:`search`"
msgstr ""

View File

@@ -0,0 +1,228 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 14:15+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: zh_CN\n"
"Language-Team: zh_CN <LL@li.org>\n"
"Plural-Forms: nplurals=1; plural=0;\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/initialize.rst:2 b829330eebd24620b745072bbfc26c98
msgid "训练构建"
msgstr "Training Setup"
#: ../../source/initialize.rst:7 8c8472b4647a4de8998d75b9ec6f09eb
msgid "命令行参数解析"
msgstr "Argument Parsing"
#: ../../source/initialize.rst:8 f74176fa4aee4bbfaf989ffab9283ee7
msgid ""
"InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_"
" 库来向InternLM运行时提供命令行参数配置。用户可 使用 "
"``internlm.initialize.get_default_parser()`` 来获取 InternLM "
"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。"
msgstr ""
"InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply commandline "
"configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default "
"parser with some builtin arguments, users can add custom parameters to this parser."
#: 9930855b85bf41ed8712fc40e1e034f7
#: internlm.initialize.launch.get_default_parser:1 of
msgid ""
"Reads user command line and uses an argument parser to parse the input "
"arguments. Input arguments include configuration, host, port, world size,"
" local rank, backend for torch.distributed."
msgstr ""
#: 015003b013e346bea15b4514f2001a25 544472c2ce3c43bfb59317083c6b55c9
#: 7ee60ba1a92a4b9e8174049fb498a4f0 bca7c66f1a5a4517958bcea1e09d5d10
#: f5cbe452ae694c7884ac4596a7735bf6
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.initialize.launch.get_default_parser
#: internlm.train.training_internlm.get_train_data_loader
#: internlm.train.training_internlm.initialize_model
#: internlm.train.training_internlm.initialize_optimizer of
msgid "返回"
msgstr ""
#: 9b04c3d6b98b44ee89f800b71e8d80a9
#: internlm.initialize.launch.get_default_parser:4 of
msgid ""
"Returns the parser with the default arguments, the user may add "
"customized arguments into this parser."
msgstr ""
#: 147005b197e64c4b9a96a7cfe78045bc 3634f79c9aa547a48eb3fd7f150deb51
#: d3f0aa4143c84b719cd0b53170dd86c1
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.initialize.launch.get_default_parser
#: internlm.train.training_internlm.initialize_model of
msgid "返回类型"
msgstr ""
#: ../../source/initialize.rst:25 db2bf9d3ff81483dbf218e63dd4bbbe4
msgid "模型初始化"
msgstr "Model Initialization"
#: 5c2e33e254d4495fbc4b0226aac1fddb
#: internlm.train.training_internlm.initialize_model:1 of
msgid "Initialize model with Automatic Mixed Precision."
msgstr ""
#: c1254615508542b680daf73374844f9e
#: internlm.train.training_internlm.initialize_model:3 of
msgid "The neural network model to be trained or evaluated."
msgstr ""
#: ../../source/initialize.rst:29 b9867771b9da40cd8f3a55ee5ab95f65
msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:"
msgstr ""
"InternLM uses the field ``model_type`` and ``model`` in the config file "
"to control model initialization process. An example model initialization "
"configuratio"
#: ../../source/initialize.rst:57 984a38d7f63949ecbb0d8b2ef3459d57
msgid "字段 ``model_type`` 指明了要初始化的模型类型"
msgstr ""
"The field ``model_type`` specifics the model type has been registered and"
" to be initialized."
#: ../../source/initialize.rst:58 9f04ad0f145f4e40bc75a3ef45c7a59d
msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置"
msgstr ""
"The parameters in field ``model`` specific the configuration settings "
"during model initialization."
#: ../../source/initialize.rst:60 d7780e355bb6429bb5151d9a0e6d7e36
msgid ""
"值得注意的是,用户可以定义新的模型类型,并使用装饰器 ``@MODEL_INITIALIZER.register_module`` "
"注册模型的初始化函数,其中 ``MODEL_INITIALIZER`` 是类 "
"``internlm.util.registry.Registry`` 的一个实例化对象,示例如下所示:"
msgstr ""
"It is worth noting that, users can define new model type, and register "
"model's initialization function by decorater "
"``@MODEL_INITIALIZER.register_module``, which ``MODEL_INITIALIZER`` is an"
" instantiated object of class ``internlm.util.registry.Registry``, the "
"example is shown as follows."
#: ../../source/initialize.rst:72 d863f71b208a49a09d2d00537e331962
msgid "优化器初始化"
msgstr "Optimizer Initialization"
#: acaafdc9bb96434bbd42a98f74187db1
#: internlm.train.training_internlm.initialize_optimizer:1 of
msgid "Initialize optimizer."
msgstr ""
#: 62fc4215c9a44bda8b31c933db90f270 93c398e44f6a4f708ba064250a3d253c
#: e2bebdd751724915a65dec444bb89e25
#: internlm.initialize.initialize_trainer.initialize_trainer
#: internlm.train.training_internlm.get_train_data_loader
#: internlm.train.training_internlm.initialize_optimizer of
msgid "参数"
msgstr ""
#: 2033ee96ded8423a80268b337ba9549c
#: internlm.train.training_internlm.initialize_optimizer:3 of
msgid "Your model instance to be trained or evaluated."
msgstr ""
#: df01b44c724b4326a6c85b44694262ba
#: internlm.train.training_internlm.initialize_optimizer:6 of
msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)."
msgstr ""
#: ../../source/initialize.rst:79 0b46b890048f4758a9d56e0540759d9f
msgid "数据加载器初始化"
msgstr "Dataloader Initialization"
#: 58e39b26ab4849788e792df386f01d7e
#: internlm.train.training_internlm.get_train_data_loader:1 of
msgid "Generate and return the training data loader."
msgstr ""
#: 37a91c167e0b4e5fad4edcc3caf0d012
#: internlm.train.training_internlm.get_train_data_loader:3 of
msgid "number of subprocesses used for dataloader."
msgstr ""
#: 947aba2a4f86420d9b2660425a6043cc
#: internlm.train.training_internlm.get_train_data_loader:5 of
msgid "generate function for dataset."
msgstr ""
#: 8a8f5ee665cb4e15bc33194c0b1f346c
#: internlm.train.training_internlm.get_train_data_loader:7 of
msgid "dataset sampler for training dataloader."
msgstr ""
#: 4c3e1e896e7940bf97c124909d2e7f36
#: internlm.train.training_internlm.get_train_data_loader:9 of
msgid "collate function for training dataloader."
msgstr ""
#: d9f0740d048c48888e82c8f8a78e33cd
#: internlm.train.training_internlm.get_train_data_loader:12 of
msgid "A tuple of (train_dl, dataset_types)."
msgstr ""
#: ../../source/initialize.rst:86 1c4df708ff5c47f6abae32617bf2ed31
msgid "Trainer 初始化"
msgstr "Trainer Initialization"
#: d535583dbcb245499e19c09f3f8b534a
#: internlm.initialize.initialize_trainer.initialize_trainer:1 of
msgid ""
"Core function to wrap the essential training components with our "
"functionality based on the config which is loaded into gpc.config."
msgstr ""
#: 3e370234e4b245e4b9cae1fe235df8ff
#: internlm.initialize.initialize_trainer.initialize_trainer:4 of
msgid "Your model instance or a function to build the model."
msgstr ""
#: b716a4a264234011a7b51fa12e575651
#: internlm.initialize.initialize_trainer.initialize_trainer:6 of
msgid "Your optimizer for training."
msgstr ""
#: 6a54ce9d516f4f14bab281c9db9816e8
#: internlm.initialize.initialize_trainer.initialize_trainer:8 of
msgid "Your criterion instance."
msgstr ""
#: ff9dfd04d31b4dc6afbdd841829b4c33
#: internlm.initialize.initialize_trainer.initialize_trainer:10 of
msgid "Dataloader for training."
msgstr ""
#: de345f9a457a4a88bf60b4ee96535e31
#: internlm.initialize.initialize_trainer.initialize_trainer:12 of
msgid "Dataloader for testing."
msgstr ""
#: 64e646b25420424d9dcdfb1ad7de5e6f
#: internlm.initialize.initialize_trainer.initialize_trainer:14 of
msgid "Your lr scheduler instance, optional."
msgstr ""
#: 39c7132bfafe4e22ae373081fee711ce
#: internlm.initialize.initialize_trainer.initialize_trainer:17 of
msgid ""
"A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``"
" where only ``trainer`` could not be None."
msgstr ""

View File

@@ -0,0 +1,140 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../../install.md:2 ../../../install.md:28
#: c237a7328df9440eb54f36c5e6ceef46 e55787faf3f74d5996f251b28422cf15
msgid "环境安装"
msgstr "Installation"
#: ../../../install.md:4 d5cd61481eb04f55a9b1636e47e2bc49
msgid "环境准备"
msgstr "Environment Preparation"
#: ../../../install.md:5 418763cd4acb4ff3afba059ae7066739
msgid "首先,需要安装的依赖包及对应版本列表如下:"
msgstr "The required packages and corresponding version are shown as follows:"
#: ../../../install.md:6 dcb95218036f4452a92a5a9c2fdbe337
msgid "Python == 3.10"
msgstr ""
#: ../../../install.md:7 79e3d9ff5df7455fa596ba63ce3089b7
msgid "GCC == 10.2.0"
msgstr ""
#: ../../../install.md:8 d14840f7b64d4a32a0be5762027e9c32
msgid "MPFR == 4.1.0"
msgstr ""
#: ../../../install.md:9 851e3e5c874a4d0f8fd37a4f85ec8f2f
msgid "CUDA >= 11.7"
msgstr ""
#: ../../../install.md:10 dbf2012c72e1479ba6647baa047ecc04
msgid "Pytorch >= 1.13.1"
msgstr ""
#: ../../../install.md:11 b191e289a079455ea906694a75439b3e
msgid "Transformers >= 4.28.0"
msgstr ""
#: ../../../install.md:12 17accf19fe184e3cb704274d8a66e87e
msgid "Flash-Attention >= v1.0.5"
msgstr ""
#: ../../../install.md:13 8063cdce4bb94947a07dbaedd97e1013
msgid "Apex == 23.05"
msgstr ""
#: ../../../install.md:14 7d6d2682ed214d0cba0048903c128bce
msgid "Ampere或者Hopper架构的GPU (例如H100, A100)"
msgstr "GPU with Ampere or Hopper architecture (such as H100, A100)"
#: ../../../install.md:15 91039fb42b94421586c558a2afcbed71
msgid "Linux OS"
msgstr ""
#: ../../../install.md:17 694b95a146d54878a4a5d57e0c1e8c6c
msgid "以上依赖包安装完成后,需要更新配置系统环境变量:"
msgstr "After installing the above dependencies, some system environment variables need to be updated:"
#: ../../../install.md:29 d0ebf84438dc43708ea517c7eff92e79
msgid "将项目`internlm`及其依赖子模块,从 github 仓库中 clone 下来,命令如下:"
msgstr "Clone the project `internlm` and its dependent submodules from the github repository, as follows:"
#: ../../../install.md:34 c278177fc1974f3fac9b33688d0591fd
msgid "推荐使用 conda 构建一个 Python-3.10 的虚拟环境, 并基于`requirements/`文件安装项目所需的依赖包:"
msgstr "It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:"
#: ../../../install.md:43 6a152c8e332f47b0ba35a9bcec2ed32d
msgid "安装 flash-attention (version v1.0.5)"
msgstr "Install flash-attention (version v1.0.5):"
#: ../../../install.md:55 d7b2116e6ca745ceb48a792fae371283
msgid "安装 Apex (version 23.05)"
msgstr "Install Apex (version 23.05):"
#: ../../../install.md:62 8bcbfb9f74de4a2796212a339feb8283
msgid "环境镜像"
msgstr "Environment Image"
#: ../../../install.md:63 6cbb97568d704cf19e7dabab20ce1d5b
msgid ""
"用户可以使用提供的 dockerfile 结合 docker.Makefile 来构建自己的镜像,或者也可以从 "
"https://hub.docker.com/r/internlm/internlm 获取安装了 InternLM 运行环境的镜像。"
msgstr "Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm."
#: ../../../install.md:65 9c29ae2ac9984a8094daf52751f5c7b9
msgid "镜像配置及构造"
msgstr "Image Configuration and Build"
#: ../../../install.md:66 12bd6b0729464cb5af663a384dadd0ec
msgid ""
"dockerfile 的配置以及构造均通过 docker.Makefile 文件实现,在 InternLM 根目录下执行如下命令即可 build "
"镜像:"
msgstr "The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM:"
#: ../../../install.md:70 b5f42dbca3e340c4bb80de1f502e0700
msgid ""
"在 docker.Makefile 中可自定义基础镜像,环境版本等内容,对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 "
"ubuntu20.04 和 centos7。"
msgstr "In docker.Makefile, you can customize the basic image, environment version, etc., and the corresponding parameters can be passed directly through the command line. For BASE_OS, ubuntu20.04 and centos7 are respectively supported."
#: ../../../install.md:72 4abb47ce9cf64b3c9b8dc23ace37a826
msgid "镜像拉取"
msgstr "Pull Standard Image"
#: ../../../install.md:73 1b6e61b2e0cb4da98f5d70d67ac638f9
msgid "基于 ubuntu 和 centos 的标准镜像已经 build 完成也可直接拉取使用:"
msgstr "The standard image based on ubuntu and centos has been built and can be directly pulled:"
#: ../../../install.md:82 2bd75cc4b74848c19775e2b1c83726c1
msgid "容器启动"
msgstr "Run Container"
#: ../../../install.md:83 4bb2dd4bba904255a204776a50721159
msgid "对于使用 dockerfile 构建或拉取的本地标准镜像,使用如下命令启动并进入容器:"
msgstr "For the local standard image built with dockerfile or pulled, use the following command to run and enter the container:"
#: ../../../install.md:87 66613606256e4094a6be5ab2af1269ae
msgid "容器内默认目录即 `/InternLM`,根据[使用文档](./usage.md)即可启动训练。"
msgstr "The default directory in the container is `/InternLM`, please start training according to the [Usage](./usage.md)."

View File

@@ -0,0 +1,198 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/monitor.rst:2 f95ef3bff8574c77a28ca2f6212cc4b8
msgid "监控和告警"
msgstr "Monitor and Alert"
#: ../../source/monitor.rst:5 959bd4a6061f4483875c7950ab4546cf
msgid "监控"
msgstr "Monitoring"
#: ../../source/monitor.rst:7 6071bc878d894865b73380cb887847c1
msgid ""
"InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` "
"来初始化上下文监控管理。其中,一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` "
"将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。"
msgstr ""
"InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize context monitor. During this time, "
"a singleton ``internlm.monitor.monitor.MonitorManager`` will manage monitoring thread and track training status "
"with ``internlm.monitor.monitor.MonitorTracker``."
#: 9256a063b6dd449786f29e03ce085176
#: internlm.monitor.monitor.initialize_monitor_manager:1 of
msgid ""
"Initialize monitor manager for monitoring training lifetime and alerting "
"exception info to Feishu."
msgstr ""
#: 138340fca72a4226be901f7f16c8a590 904b7938fdea46bf81c1ef738aa7bfae
#: 9ed2a7b4af2243b289e72b2751aec902 aa0dd0dc6bee4a5bb15cc9705f7c13ee
#: internlm.monitor.alert.send_feishu_msg_with_webhook
#: internlm.monitor.monitor.MonitorManager.start_monitor
#: internlm.monitor.monitor.MonitorTracker
#: internlm.monitor.monitor.initialize_monitor_manager of
msgid "参数"
msgstr ""
#: 3b302339e1d143b6b1d782ff59c9396d 6a06f053828b4c80aef56970750e2085
#: internlm.monitor.monitor.MonitorManager.start_monitor:3
#: internlm.monitor.monitor.initialize_monitor_manager:3 of
msgid "The training job name."
msgstr ""
#: 3330d06145ee4d35b0b3632e799a35b3 c105473f2f6a4f838a9f0d098762d698
#: internlm.monitor.monitor.MonitorManager.start_monitor:5
#: internlm.monitor.monitor.initialize_monitor_manager:5 of
msgid "The Feishu webhook address for sending alert messages."
msgstr ""
#: 774c6ff82a2e452295a1a7dcabaded3d internlm.monitor.monitor.MonitorManager:1
#: of
msgid ""
"Monitor Manager for managing monitor thread and monitoring training "
"status."
msgstr ""
#: 72e696c0ce8f41ea8c7947d35cf322f0
#: internlm.monitor.monitor.MonitorManager.monitor_loss_spike:1 of
msgid "Check loss value, if loss spike occurs, send alert message to Feishu."
msgstr ""
#: 2b668b057fa84e8b92c65bfd49bfb3e9
#: internlm.monitor.monitor.MonitorManager.monitor_exception:1 of
msgid "Catch and format exception information, send alert message to Feishu."
msgstr ""
#: 9852b7143026476d89e1a175223e6d79
#: internlm.monitor.monitor.MonitorManager.handle_sigterm:1 of
msgid "Catch SIGTERM signal, and send alert message to Feishu."
msgstr ""
#: 2e3827bad7b1445fb0d9a7c5a28def5d
#: internlm.monitor.monitor.MonitorManager.start_monitor:1 of
msgid ""
"Initialize and start monitor thread for checking training job status, "
"loss spike and so on."
msgstr ""
#: 271cc3e1b0834a7ba6a1ba4d5cce0ef1
#: internlm.monitor.monitor.MonitorManager.start_monitor:7 of
msgid "The time of monitor interval in seconds, defaults to 300."
msgstr ""
#: e4a06091fce8401b83e31ce26c8075a0
#: internlm.monitor.monitor.MonitorManager.start_monitor:9 of
msgid ""
"The limit multiple of current loss to previous loss value, which means "
"loss spike may be occurs, defaults to 1.5."
msgstr ""
#: 28bde748477e41f39fa6ca3e1855923d
#: internlm.monitor.monitor.MonitorManager.stop_monitor:1 of
msgid "Stop the monitor and alert thread."
msgstr ""
#: ffb3dda227664748bdb326b6630bc827 internlm.monitor.monitor.MonitorTracker:1
#: of
msgid "Track job status and alert to Feishu during job training."
msgstr ""
#: a1e93683cbb04d8ab825e2776e76efa7 internlm.monitor.monitor.MonitorTracker:3
#: of
msgid "The Feishu webhook address for sending alerting messages."
msgstr ""
#: 7913eeecc0904c128046e80cec1553f2 internlm.monitor.monitor.MonitorTracker:5
#: of
msgid "The interval in seconds for monitoring checks. Defaults to 300."
msgstr ""
#: 8d1abc3067584866983139dd3d85c59c internlm.monitor.monitor.MonitorTracker:7
#: of
msgid "The threshold for detecting loss value spikes. Defaults to 1.5."
msgstr ""
#: a0416fd68700450793daa2167f776618
#: internlm.monitor.monitor.MonitorTracker.run:1 of
msgid "start the monitor tracker."
msgstr ""
#: f55eb990c07b4e8f9388236dd60f0017
#: internlm.monitor.monitor.MonitorTracker.stop:1 of
msgid "Stop the monitor tracker."
msgstr ""
#: ../../source/monitor.rst:18 2202bc091aab417097a1b0268dfe6785
msgid "告警"
msgstr "Alerting"
#: ../../source/monitor.rst:20 69334f83e644455aa619dde70b8ed1f2
msgid ""
"InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等并捕获 "
"SIGTERM 异常信号。当出现上述情况时,将触发警报,并通过调用 "
"``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook "
"地址发送报警消息。"
msgstr ""
"InternLM monitor thread periodically tracks loss spike, potential stuck condition, runtime exception, and SIGTERM signal. "
"When above situation occurs, an alert will be triggered and a message will be sent to the Feishu webhook address by calling "
"``internlm.monitor.alert.send_feishu_msg_with_webhook()``."
#: 15980526c2fa4ed8befa1604f271a3f1
#: internlm.monitor.alert.send_feishu_msg_with_webhook:1 of
msgid "Use Feishu robot to send messages with the given webhook."
msgstr ""
#: 38e5738c2b914c8096e1a0f345e6c0b4
#: internlm.monitor.alert.send_feishu_msg_with_webhook:3 of
msgid "The webhook to be used to send message."
msgstr ""
#: 4984f1a3bb0d46b48b2aad4fba8b43d9
#: internlm.monitor.alert.send_feishu_msg_with_webhook:5 of
msgid "The message title."
msgstr ""
#: a9822a4cf30d4947b12f70a0efe62a5e
#: internlm.monitor.alert.send_feishu_msg_with_webhook:7 of
msgid "The message body."
msgstr ""
#: 57d9ab65fe9f45c28351839fecf2f31e
#: internlm.monitor.alert.send_feishu_msg_with_webhook of
msgid "返回"
msgstr ""
#: 2b6ac97fd152498183a8624a9087812b
#: internlm.monitor.alert.send_feishu_msg_with_webhook:10 of
msgid "The response from the request. Or catch the exception and return None."
msgstr ""
#: ec45dedf976046eb909f5b7f79a7d44c
#: internlm.monitor.alert.send_feishu_msg_with_webhook of
msgid "抛出"
msgstr ""
#: 4c6aeec19a6041cfbfa577b1c5a85ac1
#: internlm.monitor.alert.send_feishu_msg_with_webhook:12 of
msgid "An exception rasied by the HTTP post request."
msgstr ""

View File

@@ -0,0 +1,457 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/parallel.rst:2 28d82a05db464e35aa3ec83e36597214
msgid "并行训练"
msgstr "Parallel Training"
#: ../../source/parallel.rst:6 f5c2eef4812640fca0aeaef62a2d85d4
msgid ""
"InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 "
"等并行化训练策略。在初始化分布式环境时,我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。"
msgstr ""
"InternLM supports tensor parallel, pipeline parallel, sequence parallel, data parallel, and ZeRO1.5 "
"to parallelize the training pipeline. When initializing the distributed environment, we need to specify "
"tensor parallel size, pipeline parallel size, data parallel size, and ZeRO1.5 strategy."
#: ../../source/parallel.rst:8 649c52696a734a0c86d3d5377193aba5
msgid ""
"InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定,用户可以通过修改配置文件 `config file "
"<https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ "
"来更改并行配置。以下是一个并行训练配置示例:"
msgstr ""
"The parallel setting of InternLM is fully config-driven, and you can change the parallelism by modifying "
"`config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_. An exmaple parallel "
"training configuration can be defined as follows:"
#: ../../source/parallel.rst:19 a06ae11e51ea479b9501ada103c9d071
msgid "zero1zero 并行策略,分如下三种情况,默认值为 -1"
msgstr "zero1: zero parallel strategy, divided into the following three cases, the default value is -1"
#: ../../source/parallel.rst:21 08005d5cdde84057b870495d9683c7be
msgid "当 ``zero1 <= 0``,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配"
msgstr "When ``zero1 <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range."
#: ../../source/parallel.rst:22 fe30803c0aec4b70847ac40b68641e05
msgid "当 ``zero1 == 1``,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数"
msgstr "When ``zero1 == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters."
#: ../../source/parallel.rst:23 e0acea7d80094e018fab75404ec25163
msgid ""
"当 ``zero1 > 1`` 且 ``zero1 <= data_parallel_world_size``,则 zero1 "
"进程组是数据并行进程组的子集"
msgstr "When ``zero1 > 1`` and ``zero1 <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group."
#: ../../source/parallel.rst:25 17bba79e2e884993a602df9cf20d2489
msgid "tensor张量并行大小通常是每个节点的 GPU 数量,默认值为 1"
msgstr "tensor: tensor parallel size, usually the number of GPUs per node, the default value is 1"
#: ../../source/parallel.rst:26 3bda721a03a144f28f33d360a87cbf83
msgid "pipeline流水线并行策略"
msgstr "pipeline: pipeline parallel strategy"
#: ../../source/parallel.rst:28 2b10f2b57ef64fcc872d036a7ad82b03
msgid "size流水线并行大小默认值为 1"
msgstr "size: pipeline parallel size, the default value is 1"
#: ../../source/parallel.rst:29 49c8a409e60244c49514a27780ae39a3
msgid "interleaved_overlapbool 类型,交错式调度时,开启或关闭通信优化,默认值为 False"
msgstr "interleaved_overlap: bool type, when interleaved scheduling, enable or disable communication optimization, the default value is False"
#: ../../source/parallel.rst:31 e4ff81960c434b78847174787f0423e2
msgid "sequence_parallel是否开启序列化并行默认值为 False"
msgstr "sequence_parallel: whether to enable sequence parallelism, the default value is False"
#: ../../source/parallel.rst:33 a24f4bc81fea48619ae2720e0cb6a392
msgid "注意:数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小"
msgstr "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`"
#: ../../source/parallel.rst:36 a93fc45f855c4ca7901ccbe23bf14edc
msgid "张量并行"
msgstr "Tensor Parallel"
#: ../../source/parallel.rst:38 cce9e8f3c8f14c1c96c63273baceb164
msgid ""
"InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab"
"/flash-attention>`_, 主要对 `attention "
"<https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_"
" 和 `linear "
"<https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_"
" 这两个模块进行张量并行操作。"
msgstr ""
"The implementation of tensor parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_, "
"which has tensor parallel extensions to parallelize `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ "
"and `linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ blocks in InternLM model. "
#: ../../source/parallel.rst:41 f98a4b36ffdf4381a03899b605346be6
msgid "用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。"
msgstr "To use tensor parallel, you need to set the value of tensor parallel size ``parallel.tensor`` in the config file, which is usually the number of GPUs per node."
#: ../../source/parallel.rst:47 956804e7cde441989212f7eb505e8815
msgid "张量并行,采用自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
msgstr "Tensor parallel, adopted from `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
#: ../../source/parallel.rst:50 a6424fd0ff0246fcadf56436260fadb6
msgid "流水线并行"
msgstr "Pipeline Parallel"
#: ../../source/parallel.rst:52 f2c163418fed432a8f3f59f1a5229e88
msgid ""
"InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ "
"1F1B一次前向传递后跟一次反向传递策略。对于 1F1B 策略,有两种实现方式:"
msgstr "InternLM uses `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ (one forward pass followed by one backward pass) for pipeline parallel. For 1F1B strategy, there are two implementations:"
#: ../../source/parallel.rst:54 43f3b988e2924fe9968b9d049b46ffa0
msgid "非交错调度器,内存高效。"
msgstr "non-interleaved scheduler, which is memory-efficient"
#: ../../source/parallel.rst:55 7a45446082c441d48d49b6be661ea8d2
msgid "交错调度器内存高效且时间高效GPU空泡较少。"
msgstr "interleaved scheduler, which is both memory-efficient and time-efficient."
#: ../../source/parallel.rst:61 92f2a168d7794811b56f9bb3bc170982
msgid "1F1B 流水线并行调度器,采用自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
msgstr "Non-interleaved and interleaved scheduler for 1F1B pipeline parallelism, adopted from `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
#: ../../source/parallel.rst:64 a6d3df0b74b14b158a04ddda3e904004
msgid "非交错式流水线调度"
msgstr "scheduler for non-interleaved 1F1B strategy"
#: ../../source/parallel.rst:65 1fa48743f39a44a29d78fb7f9eed5a52
msgid "如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``。"
msgstr "To use non-interleaved pipeline scheduler, users need to set ``model.num_chunks = 1`` in the config file."
#: 57206dc0bc734686841c363c88839708
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:1 of
msgid ""
"A helper schedule class for pipeline parallelism running environment. It "
"uses non-interleaved 1F1B strategy. Other properties are similar as "
":class:`NonPipelineSchedule`."
msgstr ""
#: 6475fee6f3cd462ba1073a641b322e12 7060a021efb0459598f49f74e8e7185b
#: 9218fee47e5542cab88ac65ff0054068 d1be8d5479fb48f59be379548ee24bd9
#: d41da940b4a84cd0822c3f94c2eaf344 f5654fe6eacc49dba5baa1d058df5d29
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad of
msgid "参数"
msgstr ""
#: 567e2a87a45245469af9f8709e020a20
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:5 of
msgid "The number of microbatches."
msgstr ""
#: 6d3b2256ea9c4897bf72f551f8b4696b
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:7 of
msgid "Type of data. torch.float by default."
msgstr ""
#: 6e36198f5ed344f7ad02f56aec9a333c
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:9 of
msgid ""
"The post processing function which receives a micro batch of data, and it"
" will be executed in `load_micro_batch`."
msgstr ""
#: ffae9611bd854615af1ced927f72c556
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:12 of
msgid "Specified shape in pipeline communication."
msgstr ""
#: 31d45af550334cb8a94142da335b9724
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:14 of
msgid ""
"If set to `True`, communication will be reduced over pipeline when using "
"1D tensor parallelization."
msgstr ""
#: 5c852dc7866f4e50ab87c15b86d338f2
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:16 of
msgid "List of scheduler hooks."
msgstr ""
#: 4ebec38a972b4c31a59f1fc824d51f62
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:1
#: of
msgid "To perform actions before running the schedule."
msgstr ""
#: d491d0dfa1bf41708150cc57567ac0f0
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:3
#: of
msgid "InternLM engine for training and inference."
msgstr ""
#: bc5dc62440b94825b192ad2e28641976
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:1
#: of
msgid ""
"Runs non-interleaved 1F1B schedule, with communication between pipeline "
"stages. Returns a tuple with losses if the last stage, an empty tuple "
"otherwise."
msgstr ""
#: 765809e448b644678a9fb822f6427a94 99c948f562e343aabdecac2d43650f59
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:4
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:4
#: of
msgid "Colossalai engine for training and inference."
msgstr ""
#: 31af7a46c5a645628bea05ad35757dcf 4ea88ec52c5b4df79a57ab2d217de697
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:6
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:6
#: of
msgid ""
"Dataloader as the form of an iterator, obtained by calling "
"iter(dataloader)."
msgstr ""
#: 2deff747718449fabc5b47a1de0be52e e0d2e154ac134da28470924aa65342a1
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:8
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:8
#: of
msgid ""
"Whether run forward step only. Default is false. If true, no backward "
"will be run."
msgstr ""
#: 71aa2b45248c4af28525dbc1ba4a1aff d3b3c1e350334dd2a16cbb2e8c8d339a
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:10
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:10
#: of
msgid "Whether returns the loss value. Default is true."
msgstr ""
#: 2021eaca687148539b03f6b0b1c118c8 5c138015fb254eccae2f0df2dab45629
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:12
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:12
#: of
msgid "If False, the output and label won't be returned."
msgstr ""
#: 57a86115b88541b1a7220d9535058607 5dabcd12b6d844aab8039b022ad0cf1c
#: b8ccfee837a242a3abbdf9e15eaa53d8
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
msgid "返回"
msgstr ""
#: 7dc47f5518e64d1095a6051184985f17 fe678c953e8149a5ade387e95d10d3b2
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:17
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:15
#: of
msgid "A tuple of (output, label, loss), loss and label could be None."
msgstr ""
#: a50c7c3d40e14ba8a5af06aa0cb031cb ea3574b76d604402a41fcd3874d05c9a
#: fa12b183c7534a20b61445eb9f2a2a7a
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
msgid "返回类型"
msgstr ""
#: 82936eed6da5408c9361732f8fd5cb93 c46a28c21ca149d98ff625b7fdad4c03
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:19
#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:16
#: of
msgid "Tuple[:class:`torch.Tensor`]"
msgstr ""
#: ../../source/parallel.rst:71 d2bfdbbd9a7641c38e6957a72ac6bc97
msgid "交错式流水线调度"
msgstr "scheduler for interleaved 1F1B strategy"
#: ../../source/parallel.rst:72 395c484fef984a65a284147dc3056241
msgid "如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``。"
msgstr "To use interleaved pipeline scheduler, users need to set ``model.num_chunks > 1`` in the config file."
#: 036fffe3aacc4400af38ce5252840a50
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler:1 of
msgid "Interleaved Pipeline Scheduler."
msgstr ""
#: 1b6e63b4004e44999e3ad38382b4e308
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:1
#: of
msgid ""
"Run interleaved 1F1B schedule (model split into model chunks), with "
"communication between pipeline stages as needed."
msgstr ""
#: 6ece1dfcdb5e408db4870d6c0f524787
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:15
#: of
msgid ""
"A tuple of (output, label, loss), loss and label could be None. The "
"loss would be returned only in the last stage."
msgstr ""
#: ed7e5a4826f84e9eb2840e494761437f
#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:18
#: of
msgid "The loss would be returned only in the last stage."
msgstr ""
#: ../../source/parallel.rst:77 1b771fea1d434f0b8b118f1b5344dde4
msgid "值得注意的是,在使用交错式流水线调度器时可启用通信优化功能,即在 1F1B 阶段启用异步通信,以充分利用上行/下行带宽并实现通信与计算重叠。"
msgstr "Asynchronous communication will be enabled in 1F1B stage to make full use of uplink/downlink bandwidth and achieve communication overlap. "
#: ../../source/parallel.rst:79 27430e179b454d48a052b9fe6e11ecae
msgid ""
"用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = "
"True``。该功能启用后,将调用函数 "
"``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``,并创建 "
"``internlm.core.communication.AsynCommunicator`` 以管理异步通信。"
msgstr ""
"When ``parallel.pipeline.interleaved_overlap = True``, function ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap`` will be called and "
"``internlm.core.communication.AsynCommunicator`` will be created for managing async communication."
#: ../../source/parallel.rst:81 4e0b6269ca48430098ed4619d0f0f22f
msgid "``1F1B-without-overlap`` 和 ``1F1B-with-overlap`` 的区别如下所示:"
msgstr "The difference between 1F1B stage without overlap and 1F1B stage with overlap is shown as follows:"
#: ../../source/parallel.rst:102 8412b1f6f51c479d9cbb281763215327
msgid "序列并行"
msgstr "Sequence Parallel"
#: ../../source/parallel.rst:104 45aea8164dd244e5a730881c693eeecf
msgid ""
"序列并行是一种在不引入额外计算、通信和内存开销的情况下,减少层 ``layer_norm`` 和 ``dropout`` "
"操作中的激活值内存。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-"
"AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗提高了模型在资源受限环境中的可扩展性。"
msgstr ""
"Sequence parallel is a technique to reduce activation memory in layer norm and dropout without additional computation, "
"communication or memory overhead. The implementation of sequence parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_. "
#: ../../source/parallel.rst:106 29836b441ee84df6a6dbe877930ba911
msgid "如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``。"
msgstr "To enable sequence parallel, you need to set ``parallel.sequence_parallel = True`` in the config file."
#: ../../source/parallel.rst:112 eadcd6e77c2547998b4e132939a15856
msgid "序列并行, 采用自 flash-attention"
msgstr "Sequence parallel, adopted from flash-attention"
#: ../../source/parallel.rst:115 47a0ac84251949fab0d9d8d34efb8751
msgid "数据并行"
msgstr "Data Parallel"
#: ../../source/parallel.rst:117 938ad5a1cbc846bab36e8d2f4804a685
msgid "InternLM 支持数据并行。数据并行大小为:"
msgstr "InternLM supports data parallel. For data parallel:"
#: ../../source/parallel.rst:119 1e8691a5ff4a4b40ae24815c681f7306
msgid ""
"`Data parallel size = Total number of GPUs / Pipeline parallel size / "
"Tensor parallel size`"
msgstr ""
#: ../../source/parallel.rst:122 c417e2af4e8e45ca8ca18ad39e96dadd
msgid "ZeRO1.5"
msgstr ""
#: ../../source/parallel.rst:124 9c05b4baf8a04e4b8a0f204c4e30cc9c
msgid ""
"ZeRO1.5 的实现使用了分层分片的概念,通过配置值 ``parallel.zero1`` "
"启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度,以减少内存使用并提高训练效率。"
msgstr "The implementation of ZeRO1.5 uses the concept of hierarchical sharding via config value ``parallel.zero1``, which enables sharding within local nodes."
#: ../../source/parallel.rst:126 48c994fe37d54c35bbf81f4be070e151
msgid "当 ``parallel.zero1 <= 0``,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配"
msgstr "If ``parallel.zero1 <= 0``, the size of the zero process group is equal to the size of the dp process group, so parameters will be divided within the range of dp."
#: ../../source/parallel.rst:127 3d31193758e24a08b1e90eae21259f71
msgid "当 ``parallel.zero1 == 1``,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数"
msgstr "If ``parallel.zero1 == 1``, zero is not used, and all dp groups retain the full amount of model parameters."
#: ../../source/parallel.rst:128 fb5c43d2ac75423cabc12ba1512df25e
msgid ""
"当 ``parallel.zero1 > 1`` 且 ``parallel.zero1 <= "
"data_parallel_world_size``,则 zero1 进程组是数据并行进程组的子集"
msgstr "If ``parallel.zero1 > 1`` and ``parallel.zero1 <= dp world size``, the world size of zero is a subset of dp world size. For smaller models, it is usually a better choice to split the parameters within nodes with a setting ``parallel.zero1 <= 8``."
#: ../../source/parallel.rst:130 47f03cea956a4477854591363359cdb3
msgid ""
"此外,用户可以在配置文件中通过 ``hybrid_zero_optimizer`` "
"字段启用优化器的通信优化功能,设置桶大小,以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率,以及梯度的处理方式。"
msgstr "Furthermore, you can enable communication-computation overlap, set bucket reduce size, gradient clipping parameters in the config file."
#: ../../source/parallel.rst:144 dfc63103d4e341ccb7df8ef031e29f4e
msgid "这里有两个值得关注的通信优化点:"
msgstr "There are two communication optimizations worth paying attention to here:"
#: ../../source/parallel.rst:146 e4815f887d8f48368be01339b5e64d18
msgid ""
"overlap_sync_grad: 如果设置为 ``True``,则将训练的 ``backward pass`` 与梯度的 ``all-"
"reduce`` 通信重叠"
msgstr "overlap_sync_grad: If set True, overlapping training backward pass with gradients' all-reduce communication."
#: ../../source/parallel.rst:147 bcb1aedd8a89441488b211cd81d4f80c
msgid ""
"overlap_sync_param: 如果设置为 ``True``,则将参数的 ``broadcast`` 通信与下一步的 ``forward "
"pass`` 进行重叠"
msgstr "overlap_sync_param: If set True, overlapping parameters' broadcast communication with next step's forward pass."
#: ../../source/parallel.rst:149 3ba64e4762084e93ba62a70c909e7d82
msgid "这些优化可以加速训练过程,提高训练效率。"
msgstr "These optimizations can speed up the training process and improve training efficiency."
#: 757dad6b9916403c83042b49eaa35ae5
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer:1 of
msgid "Hybrid Zero Optimizer."
msgstr ""
#: 83bcd49c056446f6806a55e6138579f2
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:1
#: of
msgid ""
"Set parameter gradients to zero. If set_to_none = True, gradient will be "
"set to None to save memory."
msgstr ""
#: 2d3da89d360c458f80844f9caed6c316
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:4
#: of
msgid "Whether set the gradient to None. Default value is True."
msgstr ""
#: 4164523156dc460cbbeaa17feed3c689
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:1 of
msgid "Performs a single optimization step."
msgstr ""
#: 5c68dace1ec649bfa849b6652051daac
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:3 of
msgid "A closure that reevaluates the model and returns the loss."
msgstr ""
#: 91e366d604ce48afa6b92666ece87b85
#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:7 of
msgid "Whether the gradient is success updated, and the gradient."
msgstr ""

View File

@@ -0,0 +1,141 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/profiler.rst:2 81b1b5f4414449dfaf107815a911f300
msgid "性能分析"
msgstr "Profiler"
#: ../../source/profiler.rst:7 d709646ebb314e9abb6a4839a21180bd
msgid "Torch Profiler"
msgstr ""
#: ../../source/profiler.rst:9 4b5b73486c794c7a9168ad19999e12e1
msgid ""
"InternLM 使用 ``internlm.train.initialize_llm_profile()`` "
"来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler "
"<https://pytorch.org/docs/stable/profiler.html>`_ ,输出的性能分析 trace 文件可以使用 "
"`tensorboard <https://www.tensorflow.org>`_ 进行可视化。"
msgstr ""
"InternLM uses ``internlm.train.initialize_llm_profile()`` to profile performance data, execution time duration and breakdown analysis of "
"step time. The implementation is based on `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_ and output tracing files can "
"be visualized with `tensorboard <https://www.tensorflow.org>`_."
#: ../../source/profiler.rst:11 40ff4289735c43fdbeca871b65e82be4
msgid ""
"用户如果想使用这个 torch 性能分析工具,需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch "
"性能分析后,用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` "
"文件夹中看到性能分析结果。"
msgstr ""
"To use this torch profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After torch "
"profiling is completed, you can find the profiling results in the ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder."
#: 876a2993b82645f7b56553fe64b514ec
#: internlm.train.training_internlm.initialize_llm_profile:1 of
msgid "Initialize and return the profiler context manager instance."
msgstr ""
#: ../../source/profiler.rst:16 3ab9536155ea4f3b8adb318005970bb8
msgid "Memory Profiler"
msgstr ""
#: ../../source/profiler.rst:18 0ec4091fef5b47c58488618bfb4dcd3b
msgid ""
"InternLM 提供了一个实用的内存分析工具 "
"``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` 来监控实际的 GPU"
" 内存使用情况。在实现中,会对模型数据(包括模型参数、模型梯度和优化器状态)和非模型数据(包括激活值)分别进行详细的统计。"
msgstr ""
"InternLM provides a practical solution ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` to monitor actual GPU memory usage. "
"In the implmentation, model data (including model parameters, model gradients, and optimizer states) and non-model data "
"(including activations) are calculated."
#: ../../source/profiler.rst:20 cd62bbd5b122480da21e10453b95090c
msgid ""
"要使用这个内存分析工具,用户需要在启动训练时传递 ``--profiling`` 参数以启用内存分析。完成内存分析后,用户可以在 "
"``memory_trace/rank{}_dp{}_tp{}`` 文件夹中找到特定 rank "
"对应的内存分析结果(包括不同时间点的内存使用日志和显示总体内存使用情况的太阳图表)。"
msgstr ""
"To use this memory profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After memory "
"profiling is completed, you can find the profiling results (including logs of memory usage at different time point and sunburst charts "
"showing overall memory usage) for a specific rank device in the ``memory_trace/rank{}_dp{}_tp{}`` folder."
#: a858f1377b714cd5ab0cf749d8dbfeb7
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:1 of
msgid "A memory profiler for a llm model."
msgstr ""
#: 08d4cca2ba154080ba72e7d3fbd2a344 36e25696cf7b4a8ca5472e86fd5eea7e
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point of
msgid "参数"
msgstr ""
#: dea424767bc44ff689d582c67b07d637
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:3 of
msgid "The model to profile."
msgstr ""
#: 4f3892910fa14324810c3f33c6af4fdd
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:5 of
msgid "The optimizer used for training the model."
msgstr ""
#: a698f2f57eef4e47a22faa546c687979
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:7 of
msgid "The file to write the memory state information to."
msgstr ""
#: 448fc2b81c794d228ec4b413356289ea
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:9 of
msgid "number of steps to trace."
msgstr ""
#: 85b3b9d4147547fd89c286f003395469
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:1 of
msgid "Record the memory state."
msgstr ""
#: d474a46415674d35a2c87c57ebff20ea
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:3 of
msgid "The options to include in the memory state. Defaults to \"\"."
msgstr ""
#: 16261fe5b1df4b13bd23f76d97caf1be
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:5 of
msgid "Whether to create a new memory record file. Defaults to False."
msgstr ""
#: 3b18845958204f07a6b80b6afb2221f5 d11f76d03d0d456889dee6d267dd4b74
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step of
msgid "返回"
msgstr ""
#: 0deeb9555efb4aa798fd9d146826e961 46b50da453f1475a88e096b5d6ed8afb
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:8
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:3 of
msgid "None"
msgstr ""
#: 4f2331ac352d4057a852b013ca688ed3
#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:1 of
msgid "Update the memory state of the optimizer state."
msgstr ""

View File

@ -0,0 +1,25 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/qa.rst:2 e3b22a39640a40cfb527068a7f4bbfc9
msgid "问&答"
msgstr "Q&A"

View File

@ -0,0 +1,127 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../source/training.rst:2 6eafa5eb08e040039309a39cdb0f1bfe
msgid "训练 API"
msgstr "Training API"
#: ../../source/training.rst:4 74d81f3d0ca54c839d4e80bd589aedb2
msgid ""
"InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` "
"管理。在定义了训练引擎和调度器之后,我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。"
msgstr ""
"InternLM training API is managed in ``internlm.core.trainer.Trainer``. After defining the "
"training engine and runtime scheduler, we can call training API to perform training, evaluation, "
"zero gradients and parameter update steps."
#: ../../source/training.rst:6 0e0cfddbb2334d3da99d3289edf4161d
msgid "有关详细用法,请参阅 Trainer API 文档和示例。"
msgstr "For detailed usage, please refer to Trainer API documentation and examples."
#: 7ea10280a8f1489984cb9994aa08976b internlm.core.trainer.Trainer:1 of
msgid ""
"This is a class tending for easy deployments of users' training and "
"evaluation instead of writing their own scripts."
msgstr ""
#: 7969dca55840451193bffd3b071ab3b3 aff576168b59460491bb5da0ce41ea74
#: internlm.core.trainer.Trainer internlm.core.trainer.Trainer.execute_schedule
#: of
msgid "参数"
msgstr ""
#: 59754d3e9ee8452a872bf397c01e0d8c internlm.core.trainer.Trainer:4 of
msgid "Engine responsible for the process function."
msgstr ""
#: 2d18ff15256e48f98901c7a7e0cbbe35 internlm.core.trainer.Trainer:6 of
msgid "Runtime schedule. Defaults to None."
msgstr ""
#: 76f4b3c7feba40eca3ee2b32559c53f5 internlm.core.trainer.Trainer.engine:1 of
msgid ""
"Returns the engine that responsible for managing the training and "
"evaluation process."
msgstr ""
#: c7eae2d4d06c4ef891e314902d80b7f3 internlm.core.trainer.Trainer.schedule:1 of
msgid "Returns the runtime scheduler."
msgstr ""
#: cb495b21b3444881aec83803e92386d9
#: internlm.core.trainer.Trainer.uses_pipeline:1 of
msgid "Returns whether the pipeline parallel is used or not."
msgstr ""
#: 86b0b631189e46468281a397c5e97350 internlm.core.trainer.Trainer.train:1 of
msgid "Sets the model to training mode."
msgstr ""
#: f997e13120ee4d8b9e45ea6698b3e2a6 internlm.core.trainer.Trainer.eval:1 of
msgid "Sets the model to evaluation mode."
msgstr ""
#: a8179e50312d47dcbe9de0433a65c2f7 internlm.core.trainer.Trainer.zero_grad:1
#: of
msgid "Sets the gradient of all parameters in the model to zero."
msgstr ""
#: f936136ef9e0452ca439b7c66dc8884b internlm.core.trainer.Trainer.step:1 of
msgid "Executes the parameter update step."
msgstr ""
#: 250e2af89cfd432c84d228f9e03c174c
#: internlm.core.trainer.Trainer.execute_schedule:1 of
msgid ""
"Runs the forward, loss computation, and backward for the model. Returns a"
" tuple of (output, label, loss)."
msgstr ""
#: 6ca7de83033b432792eb0d7935ea04da
#: internlm.core.trainer.Trainer.execute_schedule:4 of
msgid "The data iterator."
msgstr ""
#: 6d3044e75b3149beba3c659e15607b79
#: internlm.core.trainer.Trainer.execute_schedule:6 of
msgid "Additional keyword arguments."
msgstr ""
#: 99d5a297d6414c30b432acf2566f0d3c
#: internlm.core.trainer.Trainer.execute_schedule of
msgid "返回"
msgstr ""
#: b625ebf0cf874edba384456d33e740b4
#: internlm.core.trainer.Trainer.execute_schedule:8 of
msgid "A tuple of (output, label, loss)."
msgstr ""
#: 391cde57d2e2478d8f83a7ad270c2a65
#: internlm.core.trainer.Trainer.execute_schedule of
msgid "返回类型"
msgstr ""
#: d4c4fb0fbddb499786970509cf0c9e13
#: internlm.core.trainer.Trainer.execute_schedule:9 of
msgid "Tuple[:class:`torch.Tensor`]"
msgstr ""

View File

@ -0,0 +1,365 @@
# SOME DESCRIPTIVE TITLE.
# Copyright (C) 2023, InternLM Team
# This file is distributed under the same license as the InternLM package.
# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
#
#, fuzzy
msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 14:15+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
"Language-Team: en <LL@li.org>\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
#: ../../../usage.md:2 a64aaaa1525e4e01b0ddcebc42c24bbd
msgid "使用教程"
msgstr "Quickstart Guide"
#: ../../../usage.md:4 f1b40737fb584d889b82c7f55b652977
msgid ""
"启动一个 Demo "
"模型训练,需要进行三项准备,**安装****数据集准备**和**模型训练配置**。接下来,首先会介绍数据准备相关的操作,再简要描述模型训练配置相关的内容。"
msgstr ""
"To start a demo model training, you need to prepare three things: "
"**installation**, **dataset preparation**, and **model training "
"configuration**. In this guide, we will first cover the steps for dataset"
" preparation and then briefly describe the model training configuration."
#: ../../../usage.md:6 b35abe307c2f4d23866fff828308ebf2
msgid "安装"
msgstr "Installation"
#: ../../../usage.md:7 64a8c1f5f71c45519e636aa7edba10bc
msgid "请参考[安装文档](./install.md)进行安装。"
msgstr ""
"Please refer to the [installation guide](./install.md) for instructions "
"on how to install the necessary dependencies."
#: ../../../usage.md:9 bd96714d12ee415794dea5a4578bd8cd
msgid "数据准备 (预训练)"
msgstr "Dataset Preparation (Pre-training)"
#: ../../../usage.md:11 5a0b39fb9da94e96b87db40d1f231a0c
msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型可直接修改`tokernizer.py`中的模型参数路径。"
msgstr ""
"The dataset for the InternLM training task includes a series of `bin` and"
" `meta` files. A `tokenizer` is used to generate the training dataset "
"from the original text files. The tokenizer model is imported by "
"specifying the model parameter path in `tools/tokenizer.py`. Currently, "
"`V7_sft.model` is provided to generate tokens. If you want to use a "
"different model, you can directly modify the model parameter path in "
"`tokenizer.py`."
#: ../../../usage.md:13 3cef8126b8784af48d81cc140322909e
msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件,其中参数`text_input_path`表示原始文本数据路径,目前支持`txt`、`json`和`jsonl`三种输入格式,`bin_output_path`表示生成的`bin`文件的保存路径。"
msgstr ""
"You can run the following command to generate `bin` and `meta` files "
"corresponding to the original data. The parameter `text_input_path` "
"represents the path of the original text data, currently supporting "
"`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents "
"the save path of the generated `bin` files."
#: ../../../usage.md:18 107ff2280da14cb6a27f4e9857186333
msgid "下面是一个数据处理的例子:"
msgstr "Here is an example of data processing:"
#: ../../../usage.md:20 c11a9860263c4e2288a561f3435fa706
msgid "给定一个包含原始数据集的文件`raw_data.txt`,原始数据集如下所示:"
msgstr ""
"Given a file `raw_data.txt` containing the raw dataset, the raw dataset "
"is shown below:"
#: ../../../usage.md:27 4012599b42ab47bd979d2a0b79ca1147
msgid "可以通过运行以下命令来生成`bin`和`meta`文件:"
msgstr ""
"You can generate the `bin` and `meta` files by running the following "
"command:"
#: ../../../usage.md:32 cca91b6cf53a4082932dd34ea4b7f954
msgid "需要注意的是,生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下,以区分数据集的类型。"
msgstr ""
"It should be noted that the generated `bin` files need to be saved in one"
" of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or "
"`kaoshi`, depending on the type of dataset."
#: ../../../usage.md:34 417312ca1e35479e811953f777e3565a
msgid "其中,`cn`表示中文数据集;`en`表示英文数据集;`code`表示代码数据集;`ja`表示日语数据集;`ar`表示阿拉伯语数据集;`kaoshi`表示考试数据集。"
msgstr ""
"Here, `cn` represents the Chinese dataset, `en` represents the English "
"dataset, `code` represents the code dataset, `ja` represents the Japanese"
" dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the"
" exam dataset."
#: ../../../usage.md:36 79c21f8e89b34499ba4e25e20593ec28
msgid "生成的bin文件的格式如下"
msgstr "The format of the generated `bin` files is as follows:"
#: ../../../usage.md:42 26388d996c4e4116bc216be9bc007f62
msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子,表示每个句子的`token`下文将用sequence指定。"
msgstr ""
"Each line in the `bin` file corresponds to each sentence in the original "
"dataset, representing the tokens of each sentence (referred to as "
"sequence below)."
#: ../../../usage.md:44 b39148a85ee64a349975d26282fbe59b
msgid "生成的`meta`文件的格式如下:"
msgstr "The format of the generated `meta` file is as follows:"
#: ../../../usage.md:48 175a6007197a40568535f945672e5df2
msgid ""
"在`meta`文件中,每个元组对应着`bin`文件中每一个`sequence`的元信息。其中,元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting"
" index`,第二个元素表示每个`sequence`中有多少个`tokens`。"
msgstr ""
"Each tuple in the `meta` file represents the meta information of each "
"`sequence`, where the first element in the tuple indicates the `starting "
"index` of each `sequence` among all `sequences`, and the second element "
"indicates the number of `tokens` for each `sequence`."
#: ../../../usage.md:50 46874a3de3924837979f9949f1237e39
msgid ""
"例如,对于第一个`sequence``starting index`为 0有 11 "
"个`tokens`;对于第二个`sequence`,由于第一个`sequence`转换为`string`后的长度为`89`,因此它的`starting"
" index`为 90有 15 个`tokens`。"
msgstr ""
"For example, the first `sequence` starts at index 0 and has 16 `tokens`. "
"The second `sequence` starts at index 110 and has 24 `tokens`."
#: ../../../usage.md:52 25ea049fa411408b8856e7aa657835ab
msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。"
msgstr ""
"The `bin` and `meta` file formats for `json` and `jsonl` type files are "
"the same as for `txt`, so we won't go over them here."
#: ../../../usage.md:54 bc52f959cb57494483a181e843014ed1
msgid "数据准备 (微调)"
msgstr "Data Preparation (Fine-tuning)"
#: ../../../usage.md:56 73c74620c2994486acc747ba0c7f0b46
msgid ""
"微调任务的数据集格式与预训练任务保持一致,生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca "
"数据集为例,介绍微调的数据准备流程。"
msgstr ""
"The data format for fine-tuning tasks is the same as for pre-training "
"tasks, which consists of a series of `bin` and `meta` files. Let's take "
"the Alpaca dataset as an example to explain the data preparation process "
"for fine-tuning."
#: ../../../usage.md:58 75f0e22d10ca413389ec8b947ae6141f
msgid ""
"下载 [Alpaca 数据集](https://github.com/tatsu-"
"lab/stanford_alpaca/blob/main/alpaca_data.json)"
msgstr ""
"Download the [Alpaca dataset](https://github.com/tatsu-"
"lab/stanford_alpaca/blob/main/alpaca_data.json)."
#: ../../../usage.md:60 667606fcea454af48353a5b40f82fc46
msgid "对 Alpaca 数据进行 tokenize使用以下命令"
msgstr "Tokenize the Alpaca dataset using the following command:"
#: ../../../usage.md:66 60283b9237c8462ea37288b8ece79081
msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize"
msgstr ""
"It is recommended that users refer to alpaca_tokenizer.py to write new "
"scripts to tokenize their own datasets"
#: ../../../usage.md:68 cdf45a4de9874e9fb65f7104dcee3c61
msgid "训练配置"
msgstr "Training Configuration"
#: ../../../usage.md:70 7c42ebc23246450cbc1270e1461b16f6
msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。"
msgstr ""
"Taking the configuration file `configs/7B_sft.py` for the 7B demo as an "
"example, let's discuss the data, model, and parallel configurations "
"required to start a model training."
#: ../../../usage.md:72 247cfe98a7f44c2293aa2e2351f1ea69
msgid "数据配置"
msgstr "Data Configuration"
#: ../../../usage.md:73 31327e7dce5848778db5361b3fbded1c
msgid "数据相关的关键参数配置及释义如下所示:"
msgstr "Here are the key parameters and their explanations for data configuration:"
#: ../../../usage.md:88 4d2608136fef4141bd6e47f78b8591b2
msgid "![pack_into_one](./imgs/pack_into_one.png)"
msgstr ""
#: ../../../usage.md:88 c5acb028f2694712b2af788a864d5927
msgid "pack_into_one"
msgstr ""
#: ../../../usage.md:91 db6b9ce8e8294952845893dd7aad098f
msgid "目前支持传入数据集文件路径`train_folder`,且要求文件格式如下:"
msgstr ""
"Currently, it supports passing the dataset file path `train_folder`, and "
"the file format is required to be as follows:"
#: ../../../usage.md:98 f22536fc3dfa4552a103a7cb57a20f92
msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
msgstr ""
"For detailed information about the dataset, please refer to the \"Data "
"Preparation\" section."
#: ../../../usage.md:100 bc4f0b06e9c24730a7a831b7aca417e2
msgid "模型配置"
msgstr "Model Configuration"
#: ../../../usage.md:102 ecf278a0a851496fae2e49c436e59368
msgid "如果在启动训练时要加载模型 `checkpoint`,可进行如下相关配置:"
msgstr ""
"If you want to load a model checkpoint when starting the training, you "
"can configure it as follows:"
#: ../../../usage.md:115 38244aba74294067a4019d0777621746
msgid "注意:"
msgstr "Note:"
#: ../../../usage.md:116 19d1eb0a797f4bd9a702a00e525d7753
msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
msgstr ""
"`load_model_only_folder` and `load_ckpt_folder` cannot be set at the same"
" time."
#: ../../../usage.md:117 3ea27a1f6be044a3959890be69311b24
msgid "路径若以 `local:` 为前缀,则存储在本地文件系统;若以 `boto3:` 为前缀,则存储在远程 oss 上"
msgstr ""
"If the path starts with `local:`, it means the file is stored in the "
"local file system. If it starts with `boto3:`, it means the file is "
"stored in the remote OSS."
#: ../../../usage.md:119 1d6381b4cfff41d8bdd5347e8a135869
msgid "模型相关关键参数配置如下所示:"
msgstr "The configuration for the model is as follows:"
#: ../../../usage.md:143 1026791c9f054576857ef1930db6b167
msgid "注意:用户可自定义模型类型名和模型结构,并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册,在训练主函数`train.py`中初始化模型时,可通过`model_type`配置获取指定的模型初始化接口函数。"
msgstr ""
"Note: Users can customize the model type name and model structure, and "
"configure the corresponding model parameters. The model initialization "
"function interface can be registered through the `MODEL_INITIALIZER` "
"object in `utils/registry.py`. When initializing the model in the "
"training main function `train.py`, the specified model initialization "
"interface function can be obtained through the `model_type` "
"configuration."
#: ../../../usage.md:145 34823bcbe7754190bc9747758c1aad0c
msgid ""
"*如果基于 InternLM 7B继续训练可以参考 "
"[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "
"OpenXLab 链接下载权重*"
msgstr ""
"*If you want to start training based on InternLM 7B, you can refer to "
"OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
"zoo) to download weights*."
#: ../../../usage.md:147 4cabc928f8884cd38a6bb683b3bfade3
msgid "并行配置"
msgstr "Parallel Configuration"
#: ../../../usage.md:149 f97ade07340340959345e73567bae793
msgid "训练并行配置样例如下:"
msgstr "Training parallel configuration example:"
#: ../../../usage.md:158 87fb5a4e4a4047ee8a9b8bb43915636d
msgid "zero1zero 并行策略,分如下三种情况,默认值为 -1"
msgstr ""
"zero1: zero parallel strategy, divided into the following three cases, "
"default value is -1"
#: ../../../usage.md:159 58dc08e2c52e4aaba99b4fbb6cf2e8b4
#, fuzzy
msgid "当`zero1 <= 0`,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配"
msgstr ""
"When `zero1 <= 0`, the size of the zero1 process group is equal to the "
"size of the data parallel process group, so the optimizer state "
"parameters will be split within the data parallel range."
#: ../../../usage.md:160 67e2ebd795d840b29fd1d684a068e90d
#, fuzzy
msgid "当`zero1 == 1`,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数"
msgstr ""
"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain "
"the complete optimizer state parameters."
#: ../../../usage.md:161 7caedfc943514b9b83090b858ef6d163
#, fuzzy
msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`,则 zero1 进程组是数据并行进程组的子集"
msgstr ""
"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process"
" group is a subset of the data parallel process group."
#: ../../../usage.md:162 b38d3a1f72d543c6a44728fb6babea6b
msgid "tensor张量并行大小通常是每个节点的 GPU 数量,默认值为 1"
msgstr ""
"tensor: tensor parallel size, usually the number of GPUs per node, "
"default is 1"
#: ../../../usage.md:163 237ac76df68f4a999396dad37c5495c3
msgid "pipeline流水线并行策略"
msgstr "pipeline: pipeline parallel strategy"
#: ../../../usage.md:164 c8c38f6ab2ea432eb9ebbb62618ca33e
msgid "size流水线并行大小默认值为 1"
msgstr "size: pipeline parallel size, the default value is 1"
#: ../../../usage.md:165 b9158818e72e49acbdd52ad317cb80df
msgid "interleaved_overlapbool 类型,交错式调度时,开启或关闭通信优化,默认值为关闭"
msgstr ""
"interleaved_overlap: bool type, when interleaved scheduling, enable or "
"disable communication optimization, the default value is False"
#: ../../../usage.md:166 28e4d48661ff4f80aff788fdda604433
msgid "sequence_parallel是否开启序列化并行默认值为 False"
msgstr ""
"sequence_parallel: Whether to enable sequence parallelism, the default "
"value is False"
#: ../../../usage.md:168 27528ab826824d2280506460e1f2f7bd
msgid "注意:`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
msgstr ""
"Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
" / Tensor parallel size`"
#: ../../../usage.md:170 5a7af23cec604f1d9096a5ab81993c87
msgid "启动训练"
msgstr "Start Training"
#: ../../../usage.md:172 795e51542ed84cea83b63c5233bb88bc
msgid "完成了以上数据集准备和相关训练配置后,可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例,介绍训练启动方式。"
msgstr ""
"After completing the data preparation and relevant training "
"configurations mentioned above, you can start the demo training. The "
"following examples demonstrate how to start the training in both slurm "
"and torch environments."
#: ../../../usage.md:174 96402cbe443044c0a0a1695c9847140b
msgid "若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:"
msgstr ""
"If you want to start distributed training on slurm with 16 GPUs across "
"multiple nodes, use the following command:"
#: ../../../usage.md:179 c569e60401a6471eb9af2473acc4d5a6
msgid "若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:"
msgstr ""
"If you want to start distributed training on torch with 8 GPUs on a "
"single node, use the following command:"
#: ../../../usage.md:184 a045a060d0734aab9d894aed553cef34
msgid "运行结果"
msgstr "Training Results"
#: ../../../usage.md:186 c68e8dfa259647c7a6e6e0c0446b0b18
msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例,训练结果日志展示如下:"
msgstr ""
"Taking the configuration of the demo training on a single machine with 8 "
"GPUs on slurm as an example, the training result log is shown below:"

View File

@ -1,10 +1,11 @@
Sphinx Sphinx
sphinx-autobuild sphinx-autobuild
recommonmark
sphinx_rtd_theme sphinx_rtd_theme
sphinx_markdown_tables sphinx_markdown_tables
autodoc_pydantic==1.9 autodoc_pydantic==1.9
enum_tools enum_tools
numpy numpy
torch torch
tqdm tqdm
pyecharts
myst-parser

View File

@ -1,2 +1,11 @@
Model Checkpointing 模型保存
=================== ===================
InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。 其中,可以
使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份并在接收信号退出训练时自动进行模型备份。
Checkpointing
-------------
.. autoclass:: internlm.utils.model_checkpoint.CheckpointManager
:members:
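A minimal usage sketch follows; the constructor arguments are assumptions, and the only call relied on here is ``CheckpointManager.try_save_checkpoint(train_state)`` described above.
.. code-block:: python

    # Hedged sketch: ckpt_config is the `ckpt` dict from the training config;
    # model, optimizer and train_state come from the usual training setup.
    ckpt_manager = CheckpointManager(ckpt_config, model=model, optimizer=optimizer)

    for batch in train_dl:
        train_one_step(batch)
        # Saves a checkpoint when the configured step (or an exit signal) is reached.
        ckpt_manager.try_save_checkpoint(train_state)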

View File

@ -12,19 +12,25 @@ import sys
project = "InternLM" project = "InternLM"
copyright = "2023, InternLM Team" copyright = "2023, InternLM Team"
author = "InternLM Team" author = "InternLM Team"
release = "v0.2.0"
with open("../../../version.txt", "r") as f:
release = f.readline().rstrip()
master_doc = "index"
autodoc_member_order = "bysource"
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [ extensions = [
"recommonmark",
"sphinx_rtd_theme", "sphinx_rtd_theme",
"sphinx.ext.viewcode", "sphinx.ext.viewcode",
"sphinx.ext.autodoc", "sphinx.ext.autodoc",
"sphinxcontrib.autodoc_pydantic", "sphinxcontrib.autodoc_pydantic",
"sphinx.ext.autosectionlabel", "sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon", "sphinx.ext.napoleon",
"myst_parser",
] ]
pygments_style = "sphinx" pygments_style = "sphinx"
@ -71,7 +77,7 @@ html_static_path = ["_static"]
# GitHub integration # GitHub integration
html_context = { html_context = {
"display_github": True, "display_github": True,
"github_user": "pjlab", "github_user": "InternLM",
"github_repo": "InternLM", "github_repo": "InternLM",
"github_version": "master", "github_version": "master",
"conf_py_path": "/doc/code-docs/source/", "conf_py_path": "/doc/code-docs/source/",
@ -89,3 +95,9 @@ autodoc_mock_imports = [
"torch", "torch",
"numpy", "numpy",
] ]
# support multi-language docs
language = "zh_CN"
locale_dirs = ["../locales/"] # path is example but recommended.
gettext_compact = False # optional.
gettext_uuid = False # optional.

View File

@ -0,0 +1,202 @@
30B Demo
================
训练配置
----------------
30B demo 训练配置文件样例如下:
.. code-block:: python
JOB_NAME = "30b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
NUM_LAYER = 60
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro_batch contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, means disable evaluate
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with less than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the zero1 process group is a subset of the dp process group.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=-1,
tensor=4,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
启动训练
----------------
完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动两节点 16GPU 的训练命令如下所示:
.. code-block:: bash
srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/30B_sft.py
训练结果
----------------
基于以上训练配置和启动命令,两节点 16GPU 下的模型训练部分日志展示如下:
.. code-block:: bash
2023-09-06 10:29:26,629 INFO parallel_context.py:508 in set_device -- process rank 10 is bound to host:HOST-10-140-66-20 device: 2
2023-09-06 10:29:26,632 INFO parallel_context.py:508 in set_device -- process rank 11 is bound to host:HOST-10-140-66-20 device: 3
2023-09-06 10:29:26,634 INFO parallel_context.py:508 in set_device -- process rank 12 is bound to host:HOST-10-140-66-20 device: 4
2023-09-06 10:29:26,636 INFO parallel_context.py:508 in set_device -- process rank 9 is bound to host:HOST-10-140-66-20 device: 1
2023-09-06 10:29:26,640 INFO parallel_context.py:508 in set_device -- process rank 15 is bound to host:HOST-10-140-66-20 device: 7
2023-09-06 10:29:26,639 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:HOST-10-140-66-9 device: 0
2023-09-06 10:29:26,641 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:HOST-10-140-66-9 device: 2
2023-09-06 10:29:26,643 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:HOST-10-140-66-9 device: 5
2023-09-06 10:29:26,645 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:HOST-10-140-66-9 device: 6
2023-09-06 10:29:26,661 INFO parallel_context.py:508 in set_device -- process rank 13 is bound to host:HOST-10-140-66-20 device: 5
2023-09-06 10:29:26,707 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:HOST-10-140-66-9 device: 1
2023-09-06 10:29:26,826 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:HOST-10-140-66-9 device: 4
2023-09-06 10:29:26,871 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:HOST-10-140-66-9 device: 7
2023-09-06 10:29:26,932 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:HOST-10-140-66-9 device: 3
2023-09-06 10:29:27,156 INFO parallel_context.py:508 in set_device -- process rank 14 is bound to host:HOST-10-140-66-20 device: 6
2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0
2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4
2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0
2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan
2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan
2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan
2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan
2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan
2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan

View File

@ -0,0 +1,192 @@
7B Demo
================
训练配置
----------------
7B demo 的训练配置文件样例如下:
.. code-block:: python
JOB_NAME = "7b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro_batch contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, means disable evaluate
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with less than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the zero1 process group is a subset of the dp process group.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=8,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
启动训练
----------------
完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动单节点 8GPU 的训练命令如下所示:
.. code-block:: bash
srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
训练结果
----------------
基于以上训练配置和启动命令,单节点 8GPU 下的模型训练部分日志展示如下:
.. code-block:: bash
2023-09-05 11:47:44,649 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:SH-IDC1-10-140-1-110 device: 4
2023-09-05 11:47:44,650 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:SH-IDC1-10-140-1-110 device: 3
2023-09-05 11:47:44,651 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:SH-IDC1-10-140-1-110 device: 6
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:SH-IDC1-10-140-1-110 device: 7
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:SH-IDC1-10-140-1-110 device: 5
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:SH-IDC1-10-140-1-110 device: 1
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:SH-IDC1-10-140-1-110 device: 2
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0
2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1
2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0
2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan
2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan
2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan
2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan
2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan
2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan

View File

@ -0,0 +1,18 @@
训练样例
================
7B Demo
------------
.. toctree::
:maxdepth: 2
7B_demo
30B Demo
------------
.. toctree::
:maxdepth: 2
30B_demo

View File

@ -3,10 +3,11 @@
You can adapt this file completely to your liking, but it should at least You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive. contain the root `toctree` directive.
InternLM InternLM
======== ========
Environment Setup 环境构建
------------------- -------------------
.. toctree:: .. toctree::
@ -14,7 +15,15 @@ Environment Setup
install install
Model Setup 快速上手
-------------------
.. toctree::
:maxdepth: 2
usage
训练构建
------------------- -------------------
.. toctree:: .. toctree::
@ -22,7 +31,7 @@ Model Setup
initialize initialize
Training API 训练 API
------------------- -------------------
.. toctree:: .. toctree::
@ -30,7 +39,7 @@ Training API
training training
Parallel Training 并行训练
------------------- -------------------
.. toctree:: .. toctree::
@ -38,15 +47,15 @@ Parallel Training
parallel parallel
Model Checkpointing 模型备份
------------------- --------------------
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
checkpoint checkpoint
Profiler 性能分析
------------------- -------------------
.. toctree:: .. toctree::
@ -54,7 +63,7 @@ Profiler
profiler profiler
Monitor 训练监控
------------------- -------------------
.. toctree:: .. toctree::
@ -62,7 +71,23 @@ Monitor
monitor monitor
Indices and tables 训练样例
-------------------
.. toctree::
:maxdepth: 2
example/index
常见问题
-------------------
.. toctree::
:maxdepth: 2
qa
索引和表格
================== ==================
* :ref:`genindex` * :ref:`genindex`

View File

@ -1,13 +1,12 @@
Training Setup 训练构建
============== ==============
.. _InternLM-args: .. _InternLM-args:
Argument Parsing 命令行参数解析
---------------- ----------------
InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply commandline
configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_ 库来向InternLM运行时提供命令行参数配置。用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。
parser with some builtin arguments, users can add custom parameters to this parser.
.. code-block:: python .. code-block:: python
@ -20,16 +19,70 @@ parser with some builtin arguments, users can add custom parameters to this pars
.. autofunction:: internlm.initialize.get_default_parser .. autofunction:: internlm.initialize.get_default_parser
.. _InternLM-init: .. _InternLM-model-init:
Model Initialization 模型初始化
------------------------- -------------------------
Optimizer Initialization .. autofunction:: internlm.train.initialize_model
InternLM 在配置文件中使用字段 ``model_type````model`` 来控制模型初始化过程。示例模型初始化配置定义如下:
.. code-block:: python
model_type = "INTERNLM" # default is "INTERNLM", used to register classes and modules for model initialization
NUM_ATTENTION_HEAD = 32
VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_LAYER = 32
MLP_RATIO = 8 / 3
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
- 字段 ``model_type`` 指明了要初始化的模型类型
- 字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置
值得注意的是,用户可以定义新的模型类型,并使用装饰器 ``@MODEL_INITIALIZER.register_module`` 注册模型的初始化函数,其中 ``MODEL_INITIALIZER`` 是类 ``internlm.util.registry.Registry`` 的一个实例化对象,示例如下所示:
.. code-block:: python
MODEL_TYPE = "NEW_MODEL"
@MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
def build_new_model_with_cfg(*args, **kwargs):
    # build and return an instance of the new model from the given config kwargs (placeholder body)
    ...
.. _InternLM-optim-init:
优化器初始化
------------------------- -------------------------
Dataloader Initialization .. autofunction:: internlm.train.initialize_optimizer
.. _InternLM-dl-init:
数据加载器初始化
------------------------- -------------------------
Trainer Initialization .. autofunction:: internlm.train.get_train_data_loader
.. _InternLM-trainer-init:
Trainer 初始化
------------------------- -------------------------
.. autofunction:: internlm.initialize.initialize_trainer
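Putting the pieces above together, a hedged sketch of the initialization flow looks like the following; the exact signatures are given by the autofunction entries, and the argument names used here are assumptions.
.. code-block:: python

    # model / optimizer / dataloader / trainer initialization (sketch)
    model = initialize_model()
    optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)
    train_dl, dataset_types = get_train_data_loader(num_worker=4)
    trainer, train_dl, _, _ = initialize_trainer(
        model=model,
        optimizer=optimizer,
        criterion=criterion,  # loss function built elsewhere
        train_dataloader=train_dl,
        lr_scheduler=lr_scheduler,
        beta2_scheduler=beta2_scheduler,
    )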

View File

@ -1,70 +1,2 @@
## Installation ```{include} ../../install.md
```
### Environment Preparation
The required packages and corresponding version are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS
After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```
### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```
It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```
Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```
Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```

View File

@ -1,10 +1,22 @@
Monitor and Alert 监控和告警
================= =================
监控
Monitoring
----------------- -----------------
InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` 来初始化上下文监控管理。其中,一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` 将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。
Alerting .. autofunction:: internlm.monitor.monitor.initialize_monitor_manager
.. autoclass:: internlm.monitor.monitor.MonitorManager
:members:
.. autoclass:: internlm.monitor.monitor.MonitorTracker
:members:
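A hedged usage sketch; the keyword names are assumptions, and the authoritative signature is given by the autofunction entry above.
.. code-block:: python

    from internlm.monitor.monitor import initialize_monitor_manager

    # The context manager starts the monitoring thread for the lifetime of the
    # training job and shuts it down on exit.
    # feishu_webhook_url: the Feishu webhook address that alert messages are sent to.
    with initialize_monitor_manager(job_name="7b_train", alert_address=feishu_webhook_url):
        main()  # the regular training entry point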
告警
----------------- -----------------
InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等并捕获 SIGTERM 异常信号。当出现上述情况时,将触发警报,并通过调用 ``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook 地址发送报警消息。
.. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook

View File

@ -1,23 +1,152 @@
Parallel Training 并行训练
================= ==================
.. 整体说一下并行配置使用方式,接下来再分模块详细说明 .. Brief introduction to training parallelism, and how-to guide about config setting
Tensor Parallel InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 等并行化训练策略。在初始化分布式环境时,我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。
InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定,用户可以通过修改配置文件 `config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ 来更改并行配置。以下是一个并行训练配置示例:
.. code-block:: python
parallel = dict(
zero1=8,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
- zero1zero 并行策略,分如下三种情况,默认值为 -1
- 当 ``zero1 <= 0``,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配
- 当 ``zero1 == 1``,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数
- 当 ``zero1 > 1````zero1 <= data_parallel_world_size``,则 zero1 进程组是数据并行进程组的子集
- tensor张量并行大小通常是每个节点的 GPU 数量,默认值为 1
- pipeline流水线并行策略
- size流水线并行大小默认值为 1
- interleaved_overlapbool 类型,交错式调度时,开启或关闭通信优化,默认值为 False
- sequence_parallel是否开启序列化并行默认值为 False
注意:数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小
张量并行
----------------- -----------------
InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_, 主要对 `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_
`linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ 这两个模块进行张量并行操作。
Pipeline Parallel 用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。
.. figure:: ../../imgs/tensor_parallel.png
:scale: 50%
:class: with-border
张量并行,采用自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_
流水线并行
----------------- -----------------
InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ 1F1B一次前向传递后跟一次反向传递策略。对于 1F1B 策略,有两种实现方式:
Sequence Parallel 1. 非交错调度器,内存高效。
2. 交错调度器内存高效且时间高效GPU空泡较少
.. figure:: ../../imgs/pipeline_schedule.png
:scale: 45%
:class: with-border
1F1B 流水线并行调度器,采用自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_
非交错式流水线调度
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
:members:
交错式流水线调度
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler
:members:
值得注意的是,在使用交错式流水线调度器时可启用通信优化功能,即在 1F1B 阶段启用异步通信,以充分利用上行/下行带宽并实现通信与计算重叠。
用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = True``。该功能启用后,将调用函数 ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``,并创建 ``internlm.core.communication.AsynCommunicator`` 以管理异步通信。
``1F1B-without-overlap````1F1B-with-overlap`` 的区别如下所示:
.. code-block:: bash
# The 1F1B stage without overlap consists of the following steps:
1. Perform the forward pass.
2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration to the previous stage, and receive the forward and backward inputs for the next iteration.
.. code-block:: bash
# The 1F1B stage with overlap consists of the following steps:
1. Perform the forward pass.
2. Check if the backward input is ready.
3. Send the forward output and receive the forward input for the next iteration.
4. Perform the backward pass.
5. Check if the forward input is ready.
6. Send the backward output and receive the backward input for the next iteration.
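Putting the two switches together, a minimal configuration sketch that selects the interleaved scheduler and enables communication overlap (the pipeline size of 4 is only illustrative):
.. code-block:: python

    model = dict(
        num_chunks=2,  # num_chunks > 1 selects the interleaved pipeline scheduler
        # ... other model fields unchanged
    )
    parallel = dict(
        pipeline=dict(size=4, interleaved_overlap=True),  # enable 1F1B-with-overlap
    )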
序列并行
----------------- -----------------
序列并行是一种在不引入额外计算、通信和内存开销的情况下,减少层 ``layer_norm````dropout`` 操作中的激活值内存。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗,提高了模型在资源受限环境中的可扩展性。
Data Parallel 如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``
.. figure:: ../../imgs/sequence_parallel.png
:scale: 50%
:class: with-border
序列并行, 采用自 flash-attention
数据并行
----------------- -----------------
InternLM 支持数据并行。数据并行大小为:
`Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
ZeRO1.5 ZeRO1.5
----------------- -----------------
ZeRO1.5 的实现使用了分层分片的概念,通过配置值 ``parallel.zero1`` 启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度,以减少内存使用并提高训练效率。
1. 当 ``parallel.zero1 <= 0``,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配
2. 当 ``parallel.zero1 == 1``,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数
3. 当 ``parallel.zero1 > 1````parallel.zero1 <= data_parallel_world_size``,则 zero1 进程组是数据并行进程组的子集
此外,用户可以在配置文件中通过 ``hybrid_zero_optimizer`` 字段启用优化器的通信优化功能,设置桶大小,以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率,以及梯度的处理方式。
.. code-block:: python
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
这里有两个值得关注的通信优化点:
- overlap_sync_grad: 如果设置为 ``True``,则将训练的 ``backward pass`` 与梯度的 ``all-reduce`` 通信重叠
- overlap_sync_param: 如果设置为 ``True``,则将参数的 ``broadcast`` 通信与下一步的 ``forward pass`` 进行重叠
这些优化可以加速训练过程,提高训练效率。
.. autoclass:: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer
:members:

View File

@ -1,11 +1,23 @@
Profiler
========

.. Mainly about the usage of torch profiler and memory profiler

Torch Profiler
-----------------

InternLM uses ``internlm.train.initialize_llm_profile()`` to collect and analyze performance data, such as CPU/CUDA/memory statistics, during model training or inference. The implementation is based on `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_, and the resulting trace files can be visualized with `tensorboard <https://www.tensorflow.org>`_.

To use this torch profiling tool, users need to pass the ``--profiling`` flag when launching training. After torch profiling completes, the results can be found in the ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder.
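As a hypothetical launch command (the ``torchrun`` launcher, script name, and config path below are placeholders; only the ``--config`` and ``--profiling`` flags come from this documentation):

.. code-block:: bash

    # Hypothetical example: launch an 8-GPU training run with torch profiling enabled
    torchrun --nproc_per_node=8 train.py --config ./configs/7B_sft.py --profiling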
.. autofunction:: internlm.train.initialize_llm_profile
Memory Profiler
-----------------

InternLM provides a practical memory profiling tool, ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler``, for monitoring actual GPU memory usage. In the implementation, detailed statistics are collected separately for model data (model parameters, gradients, and optimizer states) and non-model data (activations).

To use this memory profiling tool, users need to pass the ``--profiling`` flag when launching training. After memory profiling completes, the results for a given rank, including memory usage logs over time and a sunburst chart of overall memory usage, can be found in the ``memory_trace/rank{}_dp{}_tp{}`` folder.
.. autoclass:: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
:members:

View File

@ -0,0 +1,2 @@
Q&A
====

View File

@ -1,2 +1,9 @@
Training API
============

InternLM's training API is managed by ``internlm.core.trainer.Trainer``. After the training engine and scheduler are defined, we can call the Trainer API to perform model training, evaluation, gradient zeroing, parameter updates, and so on.

For detailed usage, please refer to the Trainer API documentation and examples; an illustrative sketch is also given below.
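The following is only an illustrative sketch, not a verbatim excerpt from InternLM: ``trainer`` is assumed to be an already initialized ``internlm.core.trainer.Trainer``, ``train_iter`` an iterator over the training dataloader, and ``total_steps`` a placeholder step count; the exact arguments and return value of ``execute_schedule`` depend on the configured scheduler.

.. code-block:: python

    # Illustrative training loop sketch built on the Trainer methods documented below
    trainer.train()                  # switch the model to training mode

    for _ in range(total_steps):     # total_steps is a placeholder
        trainer.zero_grad()          # clear the gradients of all model parameters
        # run forward and backward passes through the configured scheduler
        trainer.execute_schedule(train_iter)
        trainer.step()               # apply the parameter update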
.. autoclass:: internlm.core.trainer.Trainer
:members:

View File

@ -0,0 +1,4 @@
```{include} ../../usage.md
:relative-docs: docs/
:relative-images:
```

View File

@ -1,4 +1,4 @@
## InternLM Installation ## Installation
### Environment Preparation ### Environment Preparation
The required packages and corresponding version are shown as follows: The required packages and corresponding version are shown as follows:

View File

@ -1,4 +1,4 @@
## Pre-training and Fine-tuning Tutorial for InternLM ## Quickstart Guide for Pre-training and Fine-tuning
To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration. To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration.
@ -93,10 +93,7 @@ data = dict(
) )
``` ```
<div align="left"> ![pack_into_one](../imgs/pack_into_one.png)
<img src="../imgs/pack_into_one.png" width="550"/>
</div>
Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows: Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows:
@ -172,9 +169,9 @@ parallel = dict(
``` ```
- zero1: zero parallel strategy, divided into the following three cases, default value is -1 - zero1: zero parallel strategy, divided into the following three cases, default value is -1
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range. - When `zero1 <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters. - When `zero1 == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group. - When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, default is 1 - tensor: tensor parallel size, usually the number of GPUs per node, default is 1
- pipeline: pipeline parallel strategy - pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1 - size: pipeline parallel size, the default value is 1

Three binary image files added (contents not shown): 252 KiB, 170 KiB, and 129 KiB.

View File

@ -1,4 +1,4 @@
## Dependency Installation for the InternLM Project ## Environment Installation
### Environment Preparation
First, the required packages and their corresponding versions are listed as follows:

View File

@ -1,4 +1,4 @@
## Pre-training and Fine-tuning Tutorial Based on InternLM ## Tutorial
To start a demo model training, three things need to be prepared: **installation**, **dataset preparation**, and **model training configuration**. The following first introduces the operations related to data preparation, and then briefly describes the model training configuration.
@ -84,9 +84,7 @@ data = dict(
) )
``` ```
<div align="left"> ![pack_into_one](./imgs/pack_into_one.png)
<img src="./imgs/pack_into_one.png" width="550"/>
</div>
Currently, passing in the dataset file path `train_folder` is supported, and the file format is required to be as follows:
@ -156,9 +154,9 @@ parallel = dict(
) )
``` ```
- zero1: zero parallel strategy, divided into the following three cases, default value is -1
- When `size <= 0`, the size of the zero1 process group equals the size of the data parallel process group, so the optimizer states are split within the data parallel range. - When `zero1 <= 0`, the size of the zero1 process group equals the size of the data parallel process group, so the optimizer states are split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer states. - When `zero1 == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer states.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group. - When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, default value is 1

View File

@ -26,13 +26,13 @@ class NonPipelineScheduler(BaseScheduler):
gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable
gradient accumulation. gradient accumulation.
Example: Examples:
# this shows an example of customized data_process_func >>> # this shows an example of customized data_process_func
def data_process_func(dataloader_output): >>> def data_process_func(dataloader_output):
item1, item2, item3 = dataloader_output >>> item1, item2, item3 = dataloader_output
data = (item1, item2) >>> data = (item1, item2)
label = item3 >>> label = item3
return data, label >>> return data, label
""" """
def __init__( def __init__(

View File

@ -1073,8 +1073,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
1. Perform the forward pass. 1. Perform the forward pass.
2. Perform the backward pass. 2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration 3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
to the previous stage, to the previous stage, and receive the forward and backward inputs for the next iteration.
and receive the forward and backward inputs for the next iteration.
Args: Args:
engine (Engine): The engine to use for computation. engine (Engine): The engine to use for computation.

View File

@ -146,10 +146,12 @@ class Trainer:
@property @property
def engine(self): def engine(self):
"""Returns the engine that responsible for managing the training and evaluation process."""
return self._engine return self._engine
@property @property
def schedule(self): def schedule(self):
"""Returns the runtime scheduler."""
return self._schedule return self._schedule
@property @property
@ -158,15 +160,19 @@ class Trainer:
return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler)) return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))
def train(self): def train(self):
"""Sets the model to training mode."""
self._engine.train() self._engine.train()
def eval(self): def eval(self):
"""Sets the model to evaluation mode."""
self._engine.eval() self._engine.eval()
def zero_grad(self): def zero_grad(self):
"""Sets the gradient of all parameters in the model to zero."""
self._engine.zero_grad() self._engine.zero_grad()
def step(self): def step(self):
"""Executes the parameter update step."""
return self._engine.step() return self._engine.step()
def execute_schedule(self, data_iter: Iterable, **kwargs): def execute_schedule(self, data_iter: Iterable, **kwargs):

View File

@ -43,8 +43,8 @@ def initialize_trainer(
loaded into gpc.config. loaded into gpc.config.
Args: Args:
model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model. model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
optimizer (:class:`BaseOptimizer`. optimizer (:class:`BaseOptimizer`): Your optimizer for training.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance. criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training. train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing. test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.

View File

@ -23,7 +23,7 @@ def get_default_parser():
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
Returns: Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser. Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, help="path to the config file") parser.add_argument("--config", type=str, help="path to the config file")

View File

@ -454,10 +454,9 @@ def build_model_with_cfg(
use_scaled_init: bool = True, use_scaled_init: bool = True,
use_swiglu: bool = True, use_swiglu: bool = True,
use_flash_attn: bool = True, use_flash_attn: bool = True,
sequence_parallel: bool = False, # pylint: disable=W0613
): ):
""" """
Builde model with config Build model with config.
Args: Args:
num_chunks (int): The number of partitions in pipeline parallel. 1 by default. num_chunks (int): The number of partitions in pipeline parallel. 1 by default.

View File

@ -211,6 +211,14 @@ monitor_manager = MonitorManager()
@contextmanager @contextmanager
def initialize_monitor_manager(job_name: str = None, alert_address: str = None): def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
"""
Initialize monitor manager for monitoring training lifetime and alerting exception info to Feishu.
Args:
job_name (str): The training job name.
alert_address (str): The Feishu webhook address for sending alert messages.
"""
if alert_address is not None: if alert_address is not None:
try: try:
monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address) monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)

View File

@ -48,9 +48,11 @@ logger = get_logger(__file__)
@llm_timeout(func_name="initialize_model") @llm_timeout(func_name="initialize_model")
def initialize_model(): def initialize_model():
""" """
Initialize model. Initialize model with Automatic Mixed Precision.
Returns: The neural network model to be trained or evaluated. Returns:
torch.nn.Module:
The neural network model to be trained or evaluated.
""" """
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model)) model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
@ -96,9 +98,10 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
Initialize optimizer. Initialize optimizer.
Args: Args:
model (torch.nn.Module): Your model instance to be trained or evaluated. model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler). Returns:
A tuple of (optimizer, beta2_scheduler, lr_scheduler).
""" """
if gpc.config.hybrid_zero_optimizer.overlap_sync_param: if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model) param_bcast_sync_handler = ParamBcastSyncHandler(model)
@ -134,7 +137,14 @@ def get_train_data_loader(
""" """
Generate and return the training data loader. Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types). Args:
num_worker (:class:`int`): number of subprocesses used for dataloader.
dataset_generate_func (:class:`Callable`, optional): generate function for dataset.
train_sampler (:class:`torch.utils.data.sampler`, optional): dataset sampler for training dataloader.
train_collate_fn (:class:`Callable`, optional): collate function for training dataloader.
Returns:
A tuple of (train_dl, dataset_types).
""" """
# Get the dataset types # Get the dataset types