mirror of https://github.com/InternLM/InternLM

merge internlm/develop into feature_add_moe
commit 0af5175073
@@ -16,6 +16,7 @@
[](./LICENSE)
[](https://github.com/internLM/OpenCompass/)
[](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)
[📘使用法](./doc/en/usage.md) |
[🛠️インストール](./doc/en/install.md) |
@@ -16,6 +16,7 @@
[](https://github.com/open-mmlab/mmdetection/blob/main/LICENSE)
[](https://github.com/internLM/OpenCompass/)
[](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)
[📘使用文档](./doc/usage.md) |
[🛠️安装教程](./doc/install.md) |
@@ -16,6 +16,7 @@
[](./LICENSE)
[](https://github.com/internLM/OpenCompass/)
[](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)
[📘Usage](./doc/en/usage.md) |
[🛠️Installation](./doc/en/install.md) |
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"POT-Creation-Date: 2023-09-13 17:07+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
@@ -19,30 +19,33 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"

#: ../../source/checkpoint.rst:2 09c8645fba264cdf9a80c4b62c2bb4d1
#: ../../source/checkpoint.rst:2
msgid "模型保存"
msgstr "Model Checkpointing"

#: ../../source/checkpoint.rst:4 8b158d34631045b1afdb4fb0169b3c71
#: ../../source/checkpoint.rst:4
msgid ""
"InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` "
"来管理模型保存。 其中,可以 使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
"来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。"
"来管理模型保存。其中,可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
"来保存指定 step 的模型状态。"
msgstr ""
"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation, "
"we use ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports "
"automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit."
"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to "
"manage model checkpointing. In the implementation, we use "
"``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint "
"training states at specific steps. "

#: ../../source/checkpoint.rst:8 a023b5a6d15749bfaa51cf2da194bda1
#: ../../source/checkpoint.rst:6
msgid "InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。"
msgstr "InternLM supports automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit. "

#: ../../source/checkpoint.rst:9
msgid "Checkpointing"
msgstr ""

#: 938575c699d1426c87e0b3f589a85d50
#: internlm.utils.model_checkpoint.CheckpointManager:1 of
msgid "StorageManagerContext"
msgstr ""

#: 754d6881cd034c5ebaab0f3362dd14c2
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of
msgid ""
"Exit signal detection function, if we write the exit step in the "
@@ -51,34 +54,27 @@ msgid ""
"quit."
msgstr ""

#: 2169f9fb4a8b40bc9bf6093894fc7a5e 6a55d2b2b24a44c8b78b40f19f4d950b
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training of
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "参数"
msgstr ""

#: 360a89b1591e4627ac432f4d75050354
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回"
msgstr ""

#: 2426832f4a8a4c5481be1c940e0e7b50
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of
msgid "whether to quit."
msgstr ""

#: 5f6842c261544a3c89f32d981b3ad755
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回类型"
msgstr ""

#: 1392da84b6e645bcb8dab605e1231fdc
#: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1
#: of
msgid "wait for all checkpoint uploads to be completed"
msgstr ""

#: d1774593e9c94608b49b10504bfbc38b
#: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1
#: of
msgid ""
			@ -86,38 +82,25 @@ msgid ""
 | 
			
		|||
"found, None will return."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: a3abbbd2bd574872892d908ab248e804
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:1 of
 | 
			
		||||
msgid "Attempt to restore the training state of the last ckpt."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: de021d1eb6d54955a2850c11c0191710
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:3 of
 | 
			
		||||
msgid "lr_scheduler object."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 20be15854f2e420a9d96c86b5869bfa6
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:5 of
 | 
			
		||||
msgid "optimizer object."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 68f69086c5054acc8aca15c8a764acc5
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:7 of
 | 
			
		||||
msgid "learning rate."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 5d34d34a972d4abeab4bda3e49ee157b
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:9 of
 | 
			
		||||
msgid "traing states."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 82ebb67afaa748ecabc4cef598d7fc30
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:11 of
 | 
			
		||||
msgid "traning dataloader object"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 0c95dfcd712749279daca78166bb4326
 | 
			
		||||
#: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of
 | 
			
		||||
msgid "Save checkpoint to the given folder path."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "Attempt to restore the training state of the last ckpt."
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "lr_scheduler object."
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "optimizer object."
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "learning rate."
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "traing states."
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "traning dataloader object"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -37,8 +37,8 @@ msgstr "Start Training"
 | 
			
		|||
 | 
			
		||||
#: ../../source/example/30B_demo.rst:166 24974384d5ab42e68266aeb67ae222ce
 | 
			
		||||
msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动两节点 16GPU 的训练命令如下所示:"
 | 
			
		||||
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training.
 | 
			
		||||
The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs."
 | 
			
		||||
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
 | 
			
		||||
"The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs."
 | 
			
		||||
 | 
			
		||||
#: ../../source/example/30B_demo.rst:173 948ac71ed53848f9bad07f69d956c4bb
 | 
			
		||||
msgid "训练结果"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -37,8 +37,8 @@ msgstr "Start Training"
 | 
			
		|||
 | 
			
		||||
#: ../../source/example/7B_demo.rst:164 9e7a864ae2e14d05b0681f16792e5278
 | 
			
		||||
msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动单节点 8GPU 的训练命令如下所示:"
 | 
			
		||||
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training.
 | 
			
		||||
The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs."
 | 
			
		||||
msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
 | 
			
		||||
"The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs."
 | 
			
		||||
 | 
			
		||||
#: ../../source/example/7B_demo.rst:171 fdd053efb1854d46aabf6c0f279fe7fc
 | 
			
		||||
msgid "训练结果"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ msgid ""
 | 
			
		|||
msgstr ""
 | 
			
		||||
"Project-Id-Version: InternLM \n"
 | 
			
		||||
"Report-Msgid-Bugs-To: \n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-08 15:32+0800\n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-14 12:23+0800\n"
 | 
			
		||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 | 
			
		||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 | 
			
		||||
"Language: zh_CN\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -23,24 +23,68 @@ msgstr ""
 | 
			
		|||
msgid "训练构建"
 | 
			
		||||
msgstr "Training Setup"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:7
 | 
			
		||||
#: ../../source/initialize.rst:4
 | 
			
		||||
msgid "InternLM 的训练流程可以归纳为两个步骤:"
 | 
			
		||||
msgstr "The training process of InternLM can be summarized into two steps: "
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:6
 | 
			
		||||
msgid "初始化"
 | 
			
		||||
msgstr "Initialization"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:8
 | 
			
		||||
msgid "初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Initialize model, optimizer, dataloader, trainer, and create different "
 | 
			
		||||
"types of process groups to prepare for iterative steps of hybrid parallel training. "
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:9
 | 
			
		||||
msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Initialize logger, checkpoint manager, monitor manager, and profiler to "
 | 
			
		||||
"watch, alert, and record the iterative training steps. "
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:11
 | 
			
		||||
msgid "迭代训练"
 | 
			
		||||
msgstr "Iterative training steps"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:13
 | 
			
		||||
msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Load the training engine and scheduler for hybrid parallel training "
 | 
			
		||||
"according to the configuration such as tensor parallel size, pipeline "
 | 
			
		||||
"parallel size, and data parallel size. "
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:14
 | 
			
		||||
msgid "在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"In iterative training steps, the Trainer API is called to perform zero "
 | 
			
		||||
"gradients, forward-loss-backward, and parameter update."
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:20
 | 
			
		||||
msgid "InternLM训练流程图"
 | 
			
		||||
msgstr "InternLM training process"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:25
 | 
			
		||||
msgid "命令行参数解析"
 | 
			
		||||
msgstr "Argument Parsing"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:9
 | 
			
		||||
#, fuzzy
 | 
			
		||||
#: ../../source/initialize.rst:27
 | 
			
		||||
msgid ""
 | 
			
		||||
"InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_"
 | 
			
		||||
" 库来向InternLM运行时提供命令行参数配置。用户可使用 "
 | 
			
		||||
"``internlm.initialize.get_default_parser()`` 来获取 InternLM "
 | 
			
		||||
"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。"
 | 
			
		||||
" 库来向InternLM运行时提供命令行参数配置。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"InternLM uses the `argparse "
 | 
			
		||||
"<https://docs.python.org/3/library/argparse.html>`_ library to supply "
 | 
			
		||||
"commandline configuration to the InternLM runtime. Use "
 | 
			
		||||
"``internlm.initialize.get_default_parser()`` to get InternLM's default "
 | 
			
		||||
"parser with some builtin arguments, users can add custom parameters to "
 | 
			
		||||
"this parser."
 | 
			
		||||
"commandline configuration to the InternLM runtime. "
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:29
 | 
			
		||||
msgid ""
 | 
			
		||||
"用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM "
 | 
			
		||||
"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Use ``internlm.initialize.get_default_parser()`` to get InternLM's "
 | 
			
		||||
"default parser with some builtin arguments, users can add custom "
 | 
			
		||||
"parameters to this parser."
 | 
			
		||||
 | 
			
		||||
#: internlm.initialize.launch.get_default_parser:1 of
 | 
			
		||||
msgid ""
 | 
			
		||||
| 
						 | 
				
			
			@ -69,7 +113,7 @@ msgstr ""
 | 
			
		|||
msgid "返回类型"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:25
 | 
			
		||||
#: ../../source/initialize.rst:45
 | 
			
		||||
msgid "模型初始化"
 | 
			
		||||
msgstr "Model Initialization"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -81,26 +125,26 @@ msgstr ""
 | 
			
		|||
msgid "The neural network model to be trained or evaluated."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:29
 | 
			
		||||
#: ../../source/initialize.rst:49
 | 
			
		||||
msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"InternLM uses the field ``model_type`` and ``model`` in the config file "
 | 
			
		||||
"to control model initialization process. An example model initialization "
 | 
			
		||||
"configuratio"
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:57
 | 
			
		||||
#: ../../source/initialize.rst:77
 | 
			
		||||
msgid "字段 ``model_type`` 指明了要初始化的模型类型"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"The field ``model_type`` specifics the model type has been registered and"
 | 
			
		||||
" to be initialized."
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:58
 | 
			
		||||
#: ../../source/initialize.rst:78
 | 
			
		||||
msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"The parameters in field ``model`` specific the configuration settings "
 | 
			
		||||
"during model initialization."
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:60
 | 
			
		||||
#: ../../source/initialize.rst:80
 | 
			
		||||
msgid ""
 | 
			
		||||
"值得注意的是,用户可以定义新的模型类型,并使用装饰器 ``@MODEL_INITIALIZER.register_module`` "
 | 
			
		||||
"注册模型的初始化函数,其中 ``MODEL_INITIALIZER`` 是类 "
 | 
			
		||||
| 
						 | 
				
			
			@ -112,7 +156,7 @@ msgstr ""
 | 
			
		|||
" instantiated object of class ``internlm.util.registry.Registry``, the "
 | 
			
		||||
"example is shown as follows."
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:72
 | 
			
		||||
#: ../../source/initialize.rst:92
 | 
			
		||||
msgid "优化器初始化"
 | 
			
		||||
msgstr "Optimizer Initialization"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -134,7 +178,7 @@ msgstr ""
 | 
			
		|||
msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:79
 | 
			
		||||
#: ../../source/initialize.rst:99
 | 
			
		||||
msgid "数据加载器初始化"
 | 
			
		||||
msgstr "Dataloader Initialization"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -162,7 +206,7 @@ msgstr ""
 | 
			
		|||
msgid "A tuple of (train_dl, dataset_types)."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../source/initialize.rst:86
 | 
			
		||||
#: ../../source/initialize.rst:106
 | 
			
		||||
msgid "Trainer 初始化"
 | 
			
		||||
msgstr "Trainer Initialization"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ msgid ""
 | 
			
		|||
msgstr ""
 | 
			
		||||
"Project-Id-Version: InternLM \n"
 | 
			
		||||
"Report-Msgid-Bugs-To: \n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-08 15:32+0800\n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-14 11:05+0800\n"
 | 
			
		||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 | 
			
		||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 | 
			
		||||
"Language: en\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -32,13 +32,13 @@ msgid ""
 | 
			
		|||
"InternLM 使用 ``internlm.train.initialize_llm_profile()`` "
 | 
			
		||||
"来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler "
 | 
			
		||||
"<https://pytorch.org/docs/stable/profiler.html>`_ ,输出的性能分析 trace 文件可以使用 "
 | 
			
		||||
"`tensorboard <https://www.tensorflow.org>`_ 进行可视化。"
 | 
			
		||||
"`tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_ 进行可视化。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"InternLM uses ``internlm.train.initialize_llm_profile()`` to profile "
 | 
			
		||||
"performance data, execution time duration and breakdown analysis of step "
 | 
			
		||||
"time. The implementation is based on `torch.profiler "
 | 
			
		||||
"<https://pytorch.org/docs/stable/profiler.html>`_ and output tracing "
 | 
			
		||||
"files can be visualized with `tensorboard <https://www.tensorflow.org>`_."
 | 
			
		||||
"files can be visualized with `tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_."
 | 
			
		||||
 | 
			
		||||
#: ../../source/profiler.rst:11
 | 
			
		||||
msgid ""
 | 
			
		||||
| 
						 | 
				
			
			@ -53,11 +53,15 @@ msgstr ""
 | 
			
		|||
 | 
			
		||||
#: ../../source/profiler.rst:13
 | 
			
		||||
msgid "实际运行生成的 ``Torch Profiler`` 目录结构如下:"
 | 
			
		||||
msgstr "The directory structure of ``Torch Profiler`` generated files is as follows:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"The directory structure of ``Torch Profiler`` generated files is as "
 | 
			
		||||
"follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../source/profiler.rst:22
 | 
			
		||||
msgid "其中, ``traces`` 可以通过 ``TensorBoard`` 可视化,运行命令"
 | 
			
		||||
msgstr "Among them, ``traces`` can be visualized through ``TensorBoard`` and run with the command"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Among them, ``traces`` can be visualized through ``TensorBoard`` and run "
 | 
			
		||||
"with the command"
 | 
			
		||||
 | 
			
		||||
#: ../../source/profiler.rst:29
 | 
			
		||||
msgid ""
 | 
			
		||||
| 
						 | 
				
			
			@ -66,7 +70,12 @@ msgid ""
 | 
			
		|||
"tensorboard "
 | 
			
		||||
"<https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html"
 | 
			
		||||
"#pytorch-profiler-with-tensorboard>`_"
 | 
			
		||||
msgstr "In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page, you can see the timeline of profiled operators and GPU kernels. For more usage, please refer to `torch profiler with tensorboard <https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard>`_"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page,"
 | 
			
		||||
" you can see the timeline of profiled operators and GPU kernels. For more"
 | 
			
		||||
" usage, please refer to `torch profiler with tensorboard "
 | 
			
		||||
"<https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html"
 | 
			
		||||
"#pytorch-profiler-with-tensorboard>`_"
 | 
			
		||||
 | 
			
		||||
#: internlm.train.training_internlm.initialize_llm_profile:1 of
 | 
			
		||||
msgid "Initialize and return the profiler context manager instance."
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ msgid ""
 | 
			
		|||
msgstr ""
 | 
			
		||||
"Project-Id-Version: InternLM \n"
 | 
			
		||||
"Report-Msgid-Bugs-To: \n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-14 12:23+0800\n"
 | 
			
		||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 | 
			
		||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 | 
			
		||||
"Language: en\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -19,109 +19,144 @@ msgstr ""
 | 
			
		|||
"Content-Transfer-Encoding: 8bit\n"
 | 
			
		||||
"Generated-By: Babel 2.12.1\n"
 | 
			
		||||
 | 
			
		||||
#: ../../source/training.rst:2 6eafa5eb08e040039309a39cdb0f1bfe
 | 
			
		||||
#: ../../source/training.rst:2
 | 
			
		||||
msgid "训练 API"
 | 
			
		||||
msgstr "Training API"
 | 
			
		||||
 | 
			
		||||
#: ../../source/training.rst:4 74d81f3d0ca54c839d4e80bd589aedb2
 | 
			
		||||
#: ../../source/training.rst:4
 | 
			
		||||
msgid ""
 | 
			
		||||
"InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` "
 | 
			
		||||
"管理。在定义了训练引擎和调度器之后,我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"InternLM training API is managed in ``internlm.core.trainer.Trainer``. After defining the "
 | 
			
		||||
"training engine and runtime scheduler, we can call training API to perform training, evaluation, "
 | 
			
		||||
"zero gradients and parameter update steps."
 | 
			
		||||
"InternLM training API is managed in ``internlm.core.trainer.Trainer``. "
 | 
			
		||||
"After defining the training engine and runtime scheduler, we can call "
 | 
			
		||||
"training API to perform training, evaluation, zero gradients and "
 | 
			
		||||
"parameter update steps."
 | 
			
		||||
 | 
			
		||||
#: ../../source/training.rst:6 0e0cfddbb2334d3da99d3289edf4161d
 | 
			
		||||
#: ../../source/training.rst:6
 | 
			
		||||
msgid "有关详细用法,请参阅 Trainer API 文档和示例。"
 | 
			
		||||
msgstr "For detailed usage, please refer to Trainer API documentation and examples."
 | 
			
		||||
msgstr ""
 | 
			
		||||
"For detailed usage, please refer to Trainer API documentation and "
 | 
			
		||||
"examples."
 | 
			
		||||
 | 
			
		||||
#: 7ea10280a8f1489984cb9994aa08976b internlm.core.trainer.Trainer:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer:1 of
 | 
			
		||||
msgid ""
 | 
			
		||||
"This is a class tending for easy deployments of users' training and "
 | 
			
		||||
"evaluation instead of writing their own scripts."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 7969dca55840451193bffd3b071ab3b3 aff576168b59460491bb5da0ce41ea74
 | 
			
		||||
#: internlm.core.trainer.Trainer internlm.core.trainer.Trainer.execute_schedule
 | 
			
		||||
#: of
 | 
			
		||||
msgid "参数"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 59754d3e9ee8452a872bf397c01e0d8c internlm.core.trainer.Trainer:4 of
 | 
			
		||||
#: internlm.core.trainer.Trainer:4 of
 | 
			
		||||
msgid "Engine responsible for the process function."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 2d18ff15256e48f98901c7a7e0cbbe35 internlm.core.trainer.Trainer:6 of
 | 
			
		||||
#: internlm.core.trainer.Trainer:6 of
 | 
			
		||||
msgid "Runtime schedule. Defaults to None."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 76f4b3c7feba40eca3ee2b32559c53f5 internlm.core.trainer.Trainer.engine:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer.engine:1 of
 | 
			
		||||
msgid ""
 | 
			
		||||
"Returns the engine that responsible for managing the training and "
 | 
			
		||||
"evaluation process."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: c7eae2d4d06c4ef891e314902d80b7f3 internlm.core.trainer.Trainer.schedule:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer.schedule:1 of
 | 
			
		||||
msgid "Returns the runtime scheduler."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: cb495b21b3444881aec83803e92386d9
 | 
			
		||||
#: internlm.core.trainer.Trainer.uses_pipeline:1 of
 | 
			
		||||
msgid "Returns whether the pipeline parallel is used or not."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 86b0b631189e46468281a397c5e97350 internlm.core.trainer.Trainer.train:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer.train:1 of
 | 
			
		||||
msgid "Sets the model to training mode."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: f997e13120ee4d8b9e45ea6698b3e2a6 internlm.core.trainer.Trainer.eval:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer.eval:1 of
 | 
			
		||||
msgid "Sets the model to evaluation mode."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: a8179e50312d47dcbe9de0433a65c2f7 internlm.core.trainer.Trainer.zero_grad:1
 | 
			
		||||
#: of
 | 
			
		||||
#: internlm.core.trainer.Trainer.zero_grad:1 of
 | 
			
		||||
msgid "Sets the gradient of all parameters in the model to zero."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: f936136ef9e0452ca439b7c66dc8884b internlm.core.trainer.Trainer.step:1 of
 | 
			
		||||
#: internlm.core.trainer.Trainer.step:1 of
 | 
			
		||||
msgid "Executes the parameter update step."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 250e2af89cfd432c84d228f9e03c174c
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule:1 of
 | 
			
		||||
msgid ""
 | 
			
		||||
"Runs the forward, loss computation, and backward for the model. Returns a"
 | 
			
		||||
" tuple of (output, label, loss)."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 6ca7de83033b432792eb0d7935ea04da
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule:4 of
 | 
			
		||||
msgid "The data iterator."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 6d3044e75b3149beba3c659e15607b79
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule:6 of
 | 
			
		||||
msgid "Additional keyword arguments."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 99d5a297d6414c30b432acf2566f0d3c
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule of
 | 
			
		||||
msgid "返回"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: b625ebf0cf874edba384456d33e740b4
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule:8 of
 | 
			
		||||
msgid "A tuple of (output, label, loss)."
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: 391cde57d2e2478d8f83a7ad270c2a65
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule of
 | 
			
		||||
msgid "返回类型"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: d4c4fb0fbddb499786970509cf0c9e13
 | 
			
		||||
#: internlm.core.trainer.Trainer.execute_schedule:9 of
 | 
			
		||||
msgid "Tuple[:class:`torch.Tensor`]"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#~ msgid "InternLM 的训练流程可以归纳为两个步骤:"
 | 
			
		||||
#~ msgstr "The training process of InternLM can be summarized into two steps: "
 | 
			
		||||
 | 
			
		||||
#~ msgid "初始化"
 | 
			
		||||
#~ msgstr "Initialization"
 | 
			
		||||
 | 
			
		||||
#~ msgid "初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
#~ "Initialize model, optimizer, dataloader, "
 | 
			
		||||
#~ "trainer, and create different types of"
 | 
			
		||||
#~ " process groups to prepare for "
 | 
			
		||||
#~ "iterative steps of hybrid parallel "
 | 
			
		||||
#~ "training. "
 | 
			
		||||
 | 
			
		||||
#~ msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
#~ "Initialize logger, checkpoint manager, monitor"
 | 
			
		||||
#~ " manager, and profiler to watch, "
 | 
			
		||||
#~ "alert, and record the iterative training"
 | 
			
		||||
#~ " steps. "
 | 
			
		||||
 | 
			
		||||
#~ msgid "迭代训练"
 | 
			
		||||
#~ msgstr "Iterative training steps"
 | 
			
		||||
 | 
			
		||||
#~ msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
#~ "Load the training engine and scheduler"
 | 
			
		||||
#~ " for hybrid parallel training according "
 | 
			
		||||
#~ "to the configuration such as tensor "
 | 
			
		||||
#~ "parallel size, pipeline parallel size, "
 | 
			
		||||
#~ "and data parallel size. "
 | 
			
		||||
 | 
			
		||||
#~ msgid "在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
#~ "In iterative training steps, the Trainer"
 | 
			
		||||
#~ " API is called to perform zero "
 | 
			
		||||
#~ "gradients, forward-loss-backward, and "
 | 
			
		||||
#~ "parameter update."
 | 
			
		||||
 | 
			
		||||
#~ msgid "InternLM训练流程图"
 | 
			
		||||
#~ msgstr "InternLM training process"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ msgid ""
 | 
			
		|||
msgstr ""
 | 
			
		||||
"Project-Id-Version: InternLM \n"
 | 
			
		||||
"Report-Msgid-Bugs-To: \n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-07 14:15+0800\n"
 | 
			
		||||
"POT-Creation-Date: 2023-09-11 14:25+0800\n"
 | 
			
		||||
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 | 
			
		||||
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 | 
			
		||||
"Language: en\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -19,11 +19,11 @@ msgstr ""
 | 
			
		|||
"Content-Transfer-Encoding: 8bit\n"
 | 
			
		||||
"Generated-By: Babel 2.12.1\n"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:2 a64aaaa1525e4e01b0ddcebc42c24bbd
 | 
			
		||||
#: ../../../usage.md:2
 | 
			
		||||
msgid "使用教程"
 | 
			
		||||
msgstr "Quickstart Guide"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:4 f1b40737fb584d889b82c7f55b652977
 | 
			
		||||
#: ../../../usage.md:4
 | 
			
		||||
msgid ""
 | 
			
		||||
"启动一个 Demo "
 | 
			
		||||
"模型训练,需要进行三项准备,**安装**,**数据集准备**和**模型训练配置**。接下来,首先会介绍数据准备相关的操作,再简要描述模型训练配置相关的内容。"
 | 
			
		||||
| 
						 | 
				
			
			@ -33,21 +33,21 @@ msgstr ""
 | 
			
		|||
"configuration**. In this guide, we will first cover the steps for dataset"
 | 
			
		||||
" preparation and then briefly describe the model training configuration."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:6 b35abe307c2f4d23866fff828308ebf2
 | 
			
		||||
#: ../../../usage.md:6
 | 
			
		||||
msgid "安装"
 | 
			
		||||
msgstr "Installation"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:7 64a8c1f5f71c45519e636aa7edba10bc
 | 
			
		||||
#: ../../../usage.md:7
 | 
			
		||||
msgid "请参考[安装文档](./install.md)进行安装。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Please refer to the [installation guide](./install.md) for instructions "
 | 
			
		||||
"on how to install the necessary dependencies."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:9 bd96714d12ee415794dea5a4578bd8cd
 | 
			
		||||
#: ../../../usage.md:9
 | 
			
		||||
msgid "数据准备 (预训练)"
 | 
			
		||||
msgstr "Dataset Preparation (Pre-training)"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:11 5a0b39fb9da94e96b87db40d1f231a0c
 | 
			
		||||
#: ../../../usage.md:11
 | 
			
		||||
msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型,可直接修改`tokernizer.py`中的模型参数路径。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"The dataset for the InternLM training task includes a series of `bin` and"
 | 
			
		||||
| 
						 | 
				
			
			@ -58,7 +58,7 @@ msgstr ""
 | 
			
		|||
"different model, you can directly modify the model parameter path in "
 | 
			
		||||
"`tokenizer.py`."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:13 3cef8126b8784af48d81cc140322909e
 | 
			
		||||
#: ../../../usage.md:13
 | 
			
		||||
msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件,其中参数`text_input_path`表示原始文本数据路径,目前支持`txt`、`json`和`jsonl`三种输入格式,`bin_output_path`表示生成的`bin`文件的保存路径。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"You can run the following command to generate `bin` and `meta` files "
 | 
			
		||||
| 
						 | 
				
			
			@ -67,30 +67,30 @@ msgstr ""
 | 
			
		|||
"`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents "
 | 
			
		||||
"the save path of the generated `bin` files."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:18 107ff2280da14cb6a27f4e9857186333
 | 
			
		||||
#: ../../../usage.md:18
 | 
			
		||||
msgid "下面是一个数据处理的例子:"
 | 
			
		||||
msgstr "Here is an example of data processing:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:20 c11a9860263c4e2288a561f3435fa706
 | 
			
		||||
#: ../../../usage.md:20
 | 
			
		||||
msgid "给定一个包含原始数据集的文件`raw_data.txt`,原始数据集如下所示:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Given a file `raw_data.txt` containing the raw dataset, the raw dataset "
 | 
			
		||||
"is shown below:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:27 4012599b42ab47bd979d2a0b79ca1147
 | 
			
		||||
#: ../../../usage.md:27
 | 
			
		||||
msgid "可以通过运行以下命令来生成`bin`和`meta`文件:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"You can generate the `bin` and `meta` files by running the following "
 | 
			
		||||
"command:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:32 cca91b6cf53a4082932dd34ea4b7f954
 | 
			
		||||
#: ../../../usage.md:32
 | 
			
		||||
msgid "需要注意的是,生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下,以区分数据集的类型。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"It should be noted that the generated `bin` files need to be saved in one"
 | 
			
		||||
" of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or "
 | 
			
		||||
"`kaoshi`, depending on the type of dataset."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:34 417312ca1e35479e811953f777e3565a
 | 
			
		||||
#: ../../../usage.md:34
 | 
			
		||||
msgid "其中,`cn`表示中文数据集;`en`表示英文数据集;`code`表示代码数据集;`ja`表示日语数据集;`ar`表示阿拉伯语数据集;`kaoshi`表示考试数据集。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Here, `cn` represents the Chinese dataset, `en` represents the English "
 | 
			
		||||
| 
						 | 
				
			
			@ -98,22 +98,22 @@ msgstr ""
 | 
			
		|||
" dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the"
 | 
			
		||||
" exam dataset."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:36 79c21f8e89b34499ba4e25e20593ec28
 | 
			
		||||
#: ../../../usage.md:36
 | 
			
		||||
msgid "生成的bin文件的格式如下:"
 | 
			
		||||
msgstr "The format of the generated `bin` files is as follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:42 26388d996c4e4116bc216be9bc007f62
 | 
			
		||||
#: ../../../usage.md:42
 | 
			
		||||
msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子,表示每个句子的`token`(下文将用sequence指定)。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Each line in the `bin` file corresponds to each sentence in the original "
 | 
			
		||||
"dataset, representing the tokens of each sentence (referred to as "
 | 
			
		||||
"sequence below)."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:44 b39148a85ee64a349975d26282fbe59b
 | 
			
		||||
#: ../../../usage.md:44
 | 
			
		||||
msgid "生成的`meta`文件的格式如下:"
 | 
			
		||||
msgstr "The format of the generated `meta` file is as follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:48 175a6007197a40568535f945672e5df2
 | 
			
		||||
#: ../../../usage.md:48
 | 
			
		||||
msgid ""
 | 
			
		||||
"在`meta`文件中,每个元组对应着`bin`文件中每一个`sequence`的元信息。其中,元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting"
 | 
			
		||||
" index`,第二个元素表示每个`sequence`中有多少个`tokens`。"
 | 
			
		||||
| 
						 | 
				
			
			@ -123,7 +123,7 @@ msgstr ""
 | 
			
		|||
"index` of each `sequence` among all `sequences`, and the second element "
 | 
			
		||||
"indicates the number of `tokens` for each `sequence`."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:50 46874a3de3924837979f9949f1237e39
 | 
			
		||||
#: ../../../usage.md:50
 | 
			
		||||
msgid ""
 | 
			
		||||
"例如,对于第一个`sequence`,`starting index`为 0,有 11 "
 | 
			
		||||
"个`tokens`;对于第二个`sequence`,由于第一个`sequence`转换为`string`后的长度为`89`,因此它的`starting"
 | 
			
		||||
| 
						 | 
				
			
			@ -132,17 +132,17 @@ msgstr ""
 | 
			
		|||
"For example, the first `sequence` starts at index 0 and has 16 `tokens`. "
 | 
			
		||||
"The second `sequence` starts at index 110 and has 24 `tokens`."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:52 25ea049fa411408b8856e7aa657835ab
 | 
			
		||||
#: ../../../usage.md:52
 | 
			
		||||
msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"The `bin` and `meta` file formats for `json` and `jsonl` type files are "
 | 
			
		||||
"the same as for `txt`, so we won't go over them here."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:54 bc52f959cb57494483a181e843014ed1
 | 
			
		||||
#: ../../../usage.md:54
 | 
			
		||||
msgid "数据准备 (微调)"
 | 
			
		||||
msgstr "Data Preparation (Fine-tuning)"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:56 73c74620c2994486acc747ba0c7f0b46
 | 
			
		||||
#: ../../../usage.md:56
 | 
			
		||||
msgid ""
 | 
			
		||||
"微调任务的数据集格式与预训练任务保持一致,生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca "
 | 
			
		||||
"数据集为例,介绍微调的数据准备流程。"
 | 
			
		||||
| 
						 | 
				
			
			@ -152,7 +152,7 @@ msgstr ""
 | 
			
		|||
"the Alpaca dataset as an example to explain the data preparation process "
 | 
			
		||||
"for fine-tuning."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:58 75f0e22d10ca413389ec8b947ae6141f
 | 
			
		||||
#: ../../../usage.md:58
 | 
			
		||||
msgid ""
 | 
			
		||||
"下载 [Alpaca 数据集](https://github.com/tatsu-"
 | 
			
		||||
"lab/stanford_alpaca/blob/main/alpaca_data.json)"
 | 
			
		||||
| 
						 | 
				
			
			@ -160,87 +160,87 @@ msgstr ""
 | 
			
		|||
"Download the [Alpaca dataset](https://github.com/tatsu-"
 | 
			
		||||
"lab/stanford_alpaca/blob/main/alpaca_data.json)."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:60 667606fcea454af48353a5b40f82fc46
 | 
			
		||||
#: ../../../usage.md:60
 | 
			
		||||
msgid "对 Alpaca 数据进行 tokenize,使用以下命令"
 | 
			
		||||
msgstr "Tokenize the Alpaca dataset using the following command:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:66 60283b9237c8462ea37288b8ece79081
 | 
			
		||||
#: ../../../usage.md:66
 | 
			
		||||
msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"It is recommended that users refer to alpaca_tokenizer.py to write new "
 | 
			
		||||
"scripts to tokenize their own datasets"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:68 cdf45a4de9874e9fb65f7104dcee3c61
 | 
			
		||||
#: ../../../usage.md:68
 | 
			
		||||
msgid "训练配置"
 | 
			
		||||
msgstr "Training Configuration"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:70 7c42ebc23246450cbc1270e1461b16f6
 | 
			
		||||
msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。"
 | 
			
		||||
#: ../../../usage.md:70
 | 
			
		||||
#, fuzzy
 | 
			
		||||
msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Taking the configuration file `configs/7B_sft.py` for the 7B demo as an "
 | 
			
		||||
"example, let's discuss the data, model, and parallel configurations "
 | 
			
		||||
"example,"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:237
 | 
			
		||||
msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"let's discuss the data, model, parallel and monitoring configurations "
 | 
			
		||||
"required to start a model training."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:72 247cfe98a7f44c2293aa2e2351f1ea69
 | 
			
		||||
#: ../../../usage.md:239
 | 
			
		||||
msgid "数据配置"
 | 
			
		||||
msgstr "Data Configuration"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:73 31327e7dce5848778db5361b3fbded1c
 | 
			
		||||
#: ../../../usage.md:240
 | 
			
		||||
msgid "数据相关的关键参数配置及释义如下所示:"
 | 
			
		||||
msgstr "Here are the key parameters and their explanations for data configuration:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:88 4d2608136fef4141bd6e47f78b8591b2
 | 
			
		||||
#: ../../../usage.md:255
 | 
			
		||||
msgid ""
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:88 c5acb028f2694712b2af788a864d5927
 | 
			
		||||
#: ../../../usage.md:255
 | 
			
		||||
msgid "pack_into_one"
 | 
			
		||||
msgstr ""
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:91 db6b9ce8e8294952845893dd7aad098f
 | 
			
		||||
#: ../../../usage.md:258
 | 
			
		||||
msgid "目前支持传入数据集文件路径`train_folder`,且要求文件格式如下:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Currently, it supports passing the dataset file path `train_folder`, and "
 | 
			
		||||
"the file format is required to be as follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:98 f22536fc3dfa4552a103a7cb57a20f92
 | 
			
		||||
#: ../../../usage.md:265
 | 
			
		||||
msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"For detailed information about the dataset, please refer to the \"Data "
 | 
			
		||||
"Preparation\" section."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:100 bc4f0b06e9c24730a7a831b7aca417e2
 | 
			
		||||
#: ../../../usage.md:267
 | 
			
		||||
msgid "模型配置"
 | 
			
		||||
msgstr "Model Configuration"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:102 ecf278a0a851496fae2e49c436e59368
 | 
			
		||||
#: ../../../usage.md:269
 | 
			
		||||
msgid "如果在启动训练时要加载模型 `checkpoint`,可进行如下相关配置:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"If you want to load a model checkpoint when starting the training, you "
 | 
			
		||||
"can configure it as follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:115 38244aba74294067a4019d0777621746
 | 
			
		||||
#: ../../../usage.md:282
 | 
			
		||||
msgid "注意:"
 | 
			
		||||
msgstr "Note:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:116 19d1eb0a797f4bd9a702a00e525d7753
 | 
			
		||||
msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"`load_model_only_folder` and `load_ckpt_folder` cannot be set at the same"
 | 
			
		||||
" time."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:117 3ea27a1f6be044a3959890be69311b24
 | 
			
		||||
#: ../../../usage.md:283
 | 
			
		||||
msgid "路径若以 `local:` 为前缀,则存储在本地文件系统;若以 `boto3:` 为前缀,则存储在远程 oss 上"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"If the path starts with `local:`, it means the file is stored in the "
 | 
			
		||||
"local file system. If it starts with `boto3:`, it means the file is "
 | 
			
		||||
"stored in the remote OSS."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:119 1d6381b4cfff41d8bdd5347e8a135869
 | 
			
		||||
#: ../../../usage.md:285
 | 
			
		||||
msgid "模型相关关键参数配置如下所示:"
 | 
			
		||||
msgstr "The configuration for the model is as follows:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:143 1026791c9f054576857ef1930db6b167
 | 
			
		||||
#: ../../../usage.md:309
 | 
			
		||||
msgid "注意:用户可自定义模型类型名和模型结构,并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册,在训练主函数`train.py`中初始化模型时,可通过`model_type`配置获取指定的模型初始化接口函数。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Note: Users can customize the model type name and model structure, and "
 | 
			
		||||
| 
						 | 
				
			
			@ -251,7 +251,7 @@ msgstr ""
 | 
			
		|||
"interface function can be obtained through the `model_type` "
 | 
			
		||||
"configuration."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:145 34823bcbe7754190bc9747758c1aad0c
 | 
			
		||||
#: ../../../usage.md:311
 | 
			
		||||
msgid ""
 | 
			
		||||
"*如果基于 InternLM 7B继续训练,可以参考 "
 | 
			
		||||
"[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "
 | 
			
		||||
| 
						 | 
				
			
			@ -261,79 +261,76 @@ msgstr ""
 | 
			
		|||
"OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
 | 
			
		||||
"zoo) to download weights*."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:147 4cabc928f8884cd38a6bb683b3bfade3
 | 
			
		||||
#: ../../../usage.md:313
 | 
			
		||||
msgid "并行配置"
 | 
			
		||||
msgstr "Parallel Configuration"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:149 f97ade07340340959345e73567bae793
 | 
			
		||||
#: ../../../usage.md:315
 | 
			
		||||
msgid "训练并行配置样例如下:"
 | 
			
		||||
msgstr "Training parallel configuration example:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:158 87fb5a4e4a4047ee8a9b8bb43915636d
 | 
			
		||||
#: ../../../usage.md:324
 | 
			
		||||
msgid "zero1:zero 并行策略,分如下三种情况,默认值为 -1"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"zero1: zero parallel strategy, divided into the following three cases, "
 | 
			
		||||
"default value is -1"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:159 58dc08e2c52e4aaba99b4fbb6cf2e8b4
 | 
			
		||||
#, fuzzy
 | 
			
		||||
#: ../../../usage.md:325
 | 
			
		||||
msgid "当`zero1 <= 0`,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"When `zero1 <= 0`, the size of the zero1 process group is equal to the "
 | 
			
		||||
"size of the data parallel process group, so the optimizer state "
 | 
			
		||||
"parameters will be split within the data parallel range."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:160 67e2ebd795d840b29fd1d684a068e90d
 | 
			
		||||
#, fuzzy
 | 
			
		||||
#: ../../../usage.md:326
 | 
			
		||||
msgid "当`zero1 == 1`,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain "
 | 
			
		||||
"the complete optimizer state parameters."
 | 
			
		||||
"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain"
 | 
			
		||||
" the complete optimizer state parameters."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:161 7caedfc943514b9b83090b858ef6d163
 | 
			
		||||
#, fuzzy
 | 
			
		||||
#: ../../../usage.md:327
 | 
			
		||||
msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`,则 zero1 进程组是数据并行进程组的子集"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process"
 | 
			
		||||
" group is a subset of the data parallel process group."
 | 
			
		||||
"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 "
 | 
			
		||||
"process group is a subset of the data parallel process group."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:162 b38d3a1f72d543c6a44728fb6babea6b
 | 
			
		||||
#: ../../../usage.md:328
 | 
			
		||||
msgid "tensor:张量并行大小,通常是每个节点的 GPU 数量,默认值为 1"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"tensor: tensor parallel size, usually the number of GPUs per node, "
 | 
			
		||||
"default is 1"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:163 237ac76df68f4a999396dad37c5495c3
 | 
			
		||||
#: ../../../usage.md:329
 | 
			
		||||
msgid "pipeline:流水线并行策略"
 | 
			
		||||
msgstr "pipeline: pipeline parallel strategy"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:164 c8c38f6ab2ea432eb9ebbb62618ca33e
 | 
			
		||||
#: ../../../usage.md:330
 | 
			
		||||
msgid "size:流水线并行大小,默认值为 1"
 | 
			
		||||
msgstr "size: pipeline parallel size, the default value is 1"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:165 b9158818e72e49acbdd52ad317cb80df
 | 
			
		||||
#: ../../../usage.md:331
 | 
			
		||||
msgid "interleaved_overlap:bool 类型,交错式调度时,开启或关闭通信优化,默认值为关闭"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"interleaved_overlap: bool type, when interleaved scheduling, enable or "
 | 
			
		||||
"disable communication optimization, the default value is False"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:166 28e4d48661ff4f80aff788fdda604433
 | 
			
		||||
#: ../../../usage.md:332
 | 
			
		||||
msgid "sequence_parallel:是否开启序列化并行,默认值为 False"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"sequence_parallel: Whether to enable sequence parallelism, the default "
 | 
			
		||||
"value is False"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:168 27528ab826824d2280506460e1f2f7bd
 | 
			
		||||
#: ../../../usage.md:334
 | 
			
		||||
msgid "注意:`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
 | 
			
		||||
" / Tensor parallel size`"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:170 5a7af23cec604f1d9096a5ab81993c87
 | 
			
		||||
#: ../../../usage.md:336
 | 
			
		||||
msgid "启动训练"
 | 
			
		||||
msgstr "Start Training"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:172 795e51542ed84cea83b63c5233bb88bc
 | 
			
		||||
#: ../../../usage.md:338
 | 
			
		||||
msgid "完成了以上数据集准备和相关训练配置后,可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例,介绍训练启动方式。"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"After completing the data preparation and relevant training "
 | 
			
		||||
| 
						 | 
				
			
			@ -341,25 +338,30 @@ msgstr ""
 | 
			
		|||
"following examples demonstrate how to start the training in both slurm "
 | 
			
		||||
"and torch environments."
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:174 96402cbe443044c0a0a1695c9847140b
 | 
			
		||||
#: ../../../usage.md:340
 | 
			
		||||
msgid "若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"If you want to start distributed training on slurm with 16 GPUs across "
 | 
			
		||||
"multiple nodes, use the following command:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:179 c569e60401a6471eb9af2473acc4d5a6
 | 
			
		||||
#: ../../../usage.md:345
 | 
			
		||||
msgid "若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"If you want to start distributed training on torch with 8 GPUs on a "
 | 
			
		||||
"single node, use the following command:"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:184 a045a060d0734aab9d894aed553cef34
 | 
			
		||||
#: ../../../usage.md:350
 | 
			
		||||
msgid "运行结果"
 | 
			
		||||
msgstr "Training Results"
 | 
			
		||||
 | 
			
		||||
#: ../../../usage.md:186 c68e8dfa259647c7a6e6e0c0446b0b18
 | 
			
		||||
#: ../../../usage.md:352
 | 
			
		||||
msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例,训练结果日志展示如下:"
 | 
			
		||||
msgstr ""
 | 
			
		||||
"Taking the configuration of the demo training on a single machine with 8 "
 | 
			
		||||
"GPUs on slurm as an example, the training result log is shown below:"
 | 
			
		||||
 | 
			
		||||
#~ msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
 | 
			
		||||
#~ msgstr ""
 | 
			
		||||
#~ "`load_model_only_folder` and `load_ckpt_folder` "
 | 
			
		||||
#~ "cannot be set at the same time."
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
@@ -1,8 +1,9 @@
模型保存
===================

InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。 其中,可以
使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。
InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。其中,可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。

InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。

Checkpointing
-------------
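
The hunk above only re-wraps the prose. For readers of this diff, here is a minimal, hypothetical sketch of how the two documented calls, ``CheckpointManager.try_save_checkpoint(train_state)`` and ``CheckpointManager.wait_async_upload_finish()``, fit into a training loop. The manager's constructor arguments and the loop body are assumptions and are not part of this commit.

.. code-block:: python

    # Sketch only: `ckpt_manager` is an already-constructed
    # internlm.utils.model_checkpoint.CheckpointManager; its constructor
    # arguments are not shown in this commit.
    from internlm.utils.model_checkpoint import CheckpointManager

    def checkpoint_loop(ckpt_manager: CheckpointManager, train_state, total_steps: int):
        for _ in range(total_steps):
            ...  # forward / backward / optimizer step happen here in the real trainer
            # Saves the state of the given step when the configured save
            # frequency (or a quit signal) is hit.
            ckpt_manager.try_save_checkpoint(train_state)
        # Block until asynchronous (e.g. boto3) checkpoint uploads finish.
        ckpt_manager.wait_async_upload_finish()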
@@ -72,14 +72,14 @@ exclude_patterns = []
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
html_static_path = []

# GitHub integration
html_context = {
    "display_github": True,
    "github_user": "InternLM",
    "github_repo": "InternLM",
    "github_version": "master",
    "github_version": "main",
    "conf_py_path": "/doc/code-docs/source/",
}
@@ -1,12 +1,32 @@
训练构建
==============

InternLM 的训练流程可以归纳为两个步骤:

1. 初始化

    * 初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。
    * 初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。

2. 迭代训练

    * 根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。
    * 在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。

.. figure:: ../../imgs/hybrid_parallel_training.png
  :scale: 45%
  :class: with-border

  InternLM训练流程图

.. _InternLM-args:

命令行参数解析
----------------

InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_ 库来向InternLM运行时提供命令行参数配置。用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。
InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_ 库来向InternLM运行时提供命令行参数配置。

用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。

.. code-block:: python
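
    # NOTE: the body of this code block is cut off by the hunk boundary above;
    # the lines below are an assumed, illustrative continuation based on the
    # preceding paragraph (the default parser plus a user-defined argument),
    # not the verbatim file content.
    from internlm.initialize import get_default_parser

    parser = get_default_parser()  # parser pre-populated with InternLM's builtin arguments
    parser.add_argument("--my_flag", type=int, default=0)  # hypothetical custom argument
    args = parser.parse_args()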
@@ -6,7 +6,7 @@
Torch Profiler
-----------------

InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_ ,输出的性能分析 trace 文件可以使用 `tensorboard <https://www.tensorflow.org>`_ 进行可视化。
InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_ ,输出的性能分析 trace 文件可以使用 `tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_ 进行可视化。

用户如果想使用这个 torch 性能分析工具,需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch 性能分析后,用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` 文件夹中看到性能分析结果。
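
A minimal sketch of driving the profiler described above from a training script; the arguments to ``initialize_llm_profile()`` and the loop body are assumptions, since this commit only touches the surrounding prose and the tensorboard link.

.. code-block:: python

    # Sketch only: initialize_llm_profile() is documented as returning a
    # profiler context manager; its exact arguments are assumed here.
    from internlm.train import initialize_llm_profile

    with initialize_llm_profile(profiling=True, start_time="example_run") as prof:
        for _ in range(10):      # hypothetical short training loop
            ...                  # placeholder for the real training step
            prof.step()          # advance the torch.profiler schedule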
@@ -1,2 +1,2 @@
问&答
====
=====

168  doc/en/usage.md
			@ -74,7 +74,173 @@ It is recommended that users refer to alpaca_tokenizer.py to write new scripts t
 | 
			
		|||
 | 
			
		||||
### Training Configuration
 | 
			
		||||
 | 
			
		||||
Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, and parallel configurations required to start a model training.
 | 
			
		||||
Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, parallel and monitoring configurations required to start a model training.
 | 
			
		||||
```python
JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, means disable evaluate
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=50000,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
    1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
        so parameters will be divided within the range of dp.
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
    ),
)
```
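
As a quick sanity check on the batch-size arithmetic implied by this configuration (our own worked example, not part of `configs/7B_sft.py`; the data-parallel size below is hypothetical):

```python
# Worked example: tokens consumed per optimizer step implied by the data config above.
SEQ_LEN, micro_num, micro_bsz = 2048, 4, 2
packed_length = micro_bsz * SEQ_LEN                           # 4096 tokens per packed micro-batch
tokens_per_rank_per_step = micro_num * packed_length          # 16384 tokens per rank per step
dp_size = 8                                                   # hypothetical data-parallel size
global_tokens_per_step = tokens_per_rank_per_step * dp_size   # 131072 tokens per optimizer step
print(packed_length, tokens_per_rank_per_step, global_tokens_per_step)
```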

#### Data Configuration

Here are the key parameters and their explanations for data configuration:
(binary image added: 208 KiB, not shown)
doc/usage.md | 169
@@ -66,7 +66,174 @@ python tools/alpaca_tokenizer.py /path/to/alpaca_dataset /path/to/output_dataset

### Training Configuration

Take the configuration file `configs/7B_sft.py` for the 7B demo as an example:
```python
JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the ckpt path,
    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, means disable evaluate
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=50000,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimizer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
    1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
        so parameters will be divided within the range of dp.
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
    ),
)
```
Next, we will describe in detail the data, model, parallel, and monitoring configurations required to start model training.

#### Data Configuration

The key data-related parameters and their meanings are as follows:
@@ -4,6 +4,7 @@
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine

import json
from collections import deque
from typing import Iterable, Optional

from internlm.core.engine import Engine
@@ -58,6 +59,24 @@ class TrainState:
        if batch_sampler:
            self.init_batch_sampler(batch_sampler)

        # tgs (tokens per GPU per second) statistics, accumulated across training steps
        self.tgs_statistic = {
            "sum_step": 0,
            "sum_tg": 0,
            "sum_time": 0,
            "sum_last_tg_10": 0,
            "sum_last_time_10": 0,
            "sum_last_tg_50": 0,
            "sum_last_time_50": 0,
            "SMA_tg_50": 0,
            "SMA_time_50": 0,
            "SMA_tg_50_list": deque(),
            "SMA_time_50_list": deque(),
            "sum_tgs": 0,
            "last_tgs_10": 0,
            "last_tgs_50": 0,
        }

    def init_batch_sampler(self, batch_sampler):
        """
        Args:
@@ -379,9 +379,52 @@ def record_current_batch_training_metrics(
        max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
        max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
        min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])

        time_cost = time.time() - start_time
        # tokens processed per GPU in this step
        tk_per_gpu = round(
            num_tokens_in_batch * gpc.get_world_size(ParallelMode.DATA) / gpc.get_world_size(ParallelMode.GLOBAL),
            4,
        )
        tgs_statistic = train_state.tgs_statistic
        tgs_statistic["sum_step"] += 1
        tgs_statistic["sum_tg"] += tk_per_gpu
        tgs_statistic["sum_time"] += time_cost
        tgs_statistic["sum_last_tg_10"] += tk_per_gpu
        tgs_statistic["sum_last_time_10"] += time_cost
        tgs_statistic["sum_last_tg_50"] += tk_per_gpu
        tgs_statistic["sum_last_time_50"] += time_cost
        tgs_statistic["SMA_tg_50"] += tk_per_gpu
        tgs_statistic["SMA_time_50"] += time_cost
        tgs_statistic["SMA_tg_50_list"].append(tk_per_gpu)
        tgs_statistic["SMA_time_50_list"].append(time_cost)
        # keep a 50-step sliding window for the simple moving average (SMA)
        if tgs_statistic["sum_step"] > 50:
            tgs_statistic["SMA_tg_50"] -= tgs_statistic["SMA_tg_50_list"][0]
            tgs_statistic["SMA_time_50"] -= tgs_statistic["SMA_time_50_list"][0]
            tgs_statistic["SMA_tg_50_list"].popleft()
            tgs_statistic["SMA_time_50_list"].popleft()

        last_tgs_1 = round(tk_per_gpu / time_cost, 2)
        tgs_statistic["sum_tgs"] += last_tgs_1

        if tgs_statistic["sum_step"] % 10 == 0:
            tgs_statistic["last_tgs_10"] = round(tgs_statistic["sum_last_tg_10"] / tgs_statistic["sum_last_time_10"], 2)
            tgs_statistic["sum_last_tg_10"] = 0
            tgs_statistic["sum_last_time_10"] = 0

        if tgs_statistic["sum_step"] % 50 == 0:
            tgs_statistic["last_tgs_50"] = round(tgs_statistic["sum_last_tg_50"] / tgs_statistic["sum_last_time_50"], 2)
            tgs_statistic["sum_last_tg_50"] = 0
            tgs_statistic["sum_last_time_50"] = 0

        last_tgs_10 = tgs_statistic["last_tgs_10"]
        last_tgs_50 = tgs_statistic["last_tgs_50"]

        tgs_all = round(tgs_statistic["sum_tg"] / tgs_statistic["sum_time"], 2)
        tgs_avg = round(tgs_statistic["sum_tgs"] / tgs_statistic["sum_step"], 2)
        tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2)

        tflops = get_tflops_func((time.time() - start_time))

        tgs_origin = round(
            num_tokens_in_batch
            * gpc.get_world_size(ParallelMode.DATA)
            / gpc.get_world_size(ParallelMode.GLOBAL)
@@ -389,14 +432,18 @@ def record_current_batch_training_metrics(
            2,
        )

        tflops = get_tflops_func((time.time() - start_time))

        infos = {
            "tflops": tflops,
            "step": batch_count,
            "loss": loss.item() - moe_loss.item(),
            "moe_loss": moe_loss.item(),
            "tgs (tokens/gpu/second)": tgs_origin,
            "tgs/last_tgs_1": last_tgs_1,
            "tgs/tgs_all": tgs_all,
            "tgs/tgs_avg": tgs_avg,
            "tgs/tgs_SMA": tgs_SMA,
            "tgs/last_tgs_10": last_tgs_10,
            "tgs/last_tgs_50": last_tgs_50,
            "lr": lr,
            "loss_scale": scaler,
            "grad_norm": grad_norm,
@@ -436,7 +483,7 @@ def record_current_batch_training_metrics(
                "num_consumed_tokens": train_state.num_consumed_tokens,
                "loss": loss.item() - moe_loss.item(),
                "flops": tflops,
                "tgs": last_tgs_1,
                "acc": acc_perplex["acc"],
                "perplexity": acc_perplex["perplexity"],
                "fwd_bwd_time": fwd_bwd_time,
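The tgs bookkeeping above amounts to a 50-step sliding-window (simple moving average) of tokens per GPU per second; a self-contained sketch of the same idea (our illustration, not code from the repository):

```python
from collections import deque

# Illustrative helper mirroring the tgs_statistic bookkeeping above:
# a simple moving average of tokens/GPU/second over the last `window` steps.
class TgsSMA:
    def __init__(self, window: int = 50):
        self.window = window
        self.tokens = deque()   # tokens processed per step
        self.times = deque()    # wall-clock seconds per step
        self.sum_tokens = 0.0
        self.sum_time = 0.0

    def update(self, step_tokens: float, step_time: float) -> float:
        self.tokens.append(step_tokens)
        self.times.append(step_time)
        self.sum_tokens += step_tokens
        self.sum_time += step_time
        if len(self.tokens) > self.window:  # drop the oldest step once the window is full
            self.sum_tokens -= self.tokens.popleft()
            self.sum_time -= self.times.popleft()
        return round(self.sum_tokens / self.sum_time, 2)
```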
@@ -541,8 +541,8 @@ class CheckpointManager:

        Args:
            ckpt_config (dict): model checkpoint config.
            model (nn.module): model obj.
            optimizer (object): optimizer obj.
            lr_scheduler (object): lr_scheduler obj.
            model_config (dict): model config.
        """
@@ -806,7 +806,6 @@ now step_count is {train_state.step_count}",
        return dict(path=latest_ckpt, content=("all",), ckpt_type="internlm")

    def try_resume_training(self, train_state: TrainState, current_time=""):
        if self.load_ckpt_info is None or self.load_ckpt_info["path"] is None:
            if gpc.is_rank_for_log():
                logger.info(
@@ -0,0 +1,65 @@
import multiprocessing as mp

import pytest
import torch

from internlm.model.embedding import Embedding1D
from tests.test_model.test_model_internlm import build_environment, seed_all


def check_embedding(args):
    # init
    rank, world_size = args
    device = torch.device("cuda")
    build_environment(rank, world_size)
    rtol, atol = (1e-3, 5e-3)
    vocab_size = 4
    hidden_size = 2

    # fix seed
    seed_all(1024)

    # define embedding
    embedding = Embedding1D(
        num_embeddings=vocab_size,
        embedding_dim=hidden_size,
        padding_idx=None,
    )

    embedding.weight.data.copy_(torch.randn(vocab_size, hidden_size))
    embedding = embedding.to(device)

    # create input
    input_ids = torch.tensor([[0, 2], [1, 3]]).to(device)
    result = embedding(input_ids)

    standard_list = [[[-1.4837, 0.2671], [0.6002, -0.5496]], [[-1.8337, -0.1047], [1.0391, 0.2261]]]
    standard_result = torch.tensor(standard_list).to(device)

    # check output
    assert torch.allclose(result, standard_result, rtol=rtol, atol=atol, equal_nan=True)

    loss = torch.randn_like(result)

    # backward
    result.backward(loss)

    grad = embedding.weight.grad
    standard_glist = [[-0.4461, 0.5602], [0.4353, 1.2988], [-0.0625, -1.3609], [0.9595, -0.1144]]
    standard_grad = torch.tensor(standard_glist).to(device)

    # check grad
    assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol, equal_nan=True)


@pytest.mark.embedding
def test_embedding():
    ctx = mp.get_context("spawn")
    with ctx.Pool(processes=8) as pool:
        pool.map(check_embedding, [[rank, 8] for rank in range(8)])
        pool.close()
        pool.join()


if __name__ == "__main__":
    pytest.main(["-s", "-q", "test_embedding.py"])
			@ -0,0 +1,379 @@
 | 
			
		|||
import multiprocessing as mp
 | 
			
		||||
import random
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
import pytest
 | 
			
		||||
import torch
 | 
			
		||||
from torch import nn
 | 
			
		||||
 | 
			
		||||
import internlm
 | 
			
		||||
from internlm.core.context import ParallelMode
 | 
			
		||||
from internlm.core.context.parallel_context import Config
 | 
			
		||||
from internlm.core.context.parallel_context import global_context as gpc
 | 
			
		||||
from internlm.model.linear import RewardModelLinear, ScaleColumnParallelLinear
 | 
			
		||||
from internlm.model.modeling_internlm import PackedFlashBaseLayer1D
 | 
			
		||||
from internlm.model.utils import gather_forward_split_backward
 | 
			
		||||
 | 
			
		||||
config = Config(
 | 
			
		||||
    dict(
 | 
			
		||||
        parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1),
 | 
			
		||||
        model_type="INTERNLM",
 | 
			
		||||
        data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999),
 | 
			
		||||
        model=dict(
 | 
			
		||||
            checkpoint=False,
 | 
			
		||||
            num_attention_heads=2,
 | 
			
		||||
            embed_split_hidden=True,
 | 
			
		||||
            vocab_size=103168,
 | 
			
		||||
            embed_grad_scale=1,
 | 
			
		||||
            parallel_output=True,
 | 
			
		||||
            hidden_size=1024,
 | 
			
		||||
            num_layers=2,
 | 
			
		||||
            mlp_ratio=1,
 | 
			
		||||
            apply_post_layer_norm=False,
 | 
			
		||||
            dtype=torch.bfloat16,
 | 
			
		||||
            norm_type="rmsnorm",
 | 
			
		||||
            layer_norm_epsilon=1e-5,
 | 
			
		||||
            use_flash_attn=True,
 | 
			
		||||
            num_chunks=1,
 | 
			
		||||
        ),
 | 
			
		||||
        resume_tb_folder="",
 | 
			
		||||
        tensorboard_folder="",
 | 
			
		||||
        alert_address=None,
 | 
			
		||||
        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
 | 
			
		||||
    )
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def build_environment(rank, world_size):
 | 
			
		||||
    import os
 | 
			
		||||
 | 
			
		||||
    os.environ["RANK"] = str(rank)
 | 
			
		||||
    os.environ["LOCAL_RANK"] = str(rank)
 | 
			
		||||
    os.environ["WORLD_SIZE"] = str(world_size)
 | 
			
		||||
    os.environ["MASTER_ADDR"] = "127.0.0.1"
 | 
			
		||||
    os.environ["MASTER_PORT"] = "12345"
 | 
			
		||||
    torch.cuda.empty_cache()
 | 
			
		||||
    # launcher="torch"
 | 
			
		||||
    internlm.launch_from_torch(config=config, seed=1024)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def seed_all(seed, cuda_deterministic=False):
 | 
			
		||||
    random.seed(seed)
 | 
			
		||||
    np.random.seed(seed)
 | 
			
		||||
    torch.manual_seed(seed)
 | 
			
		||||
    if torch.cuda.is_available():
 | 
			
		||||
        torch.cuda.manual_seed(seed)
 | 
			
		||||
        torch.cuda.manual_seed_all(seed)
 | 
			
		||||
    if cuda_deterministic:  # slower, more reproducible
 | 
			
		||||
        torch.backends.cudnn.deterministic = True
 | 
			
		||||
        torch.backends.cudnn.benchmark = False
 | 
			
		||||
    else:
 | 
			
		||||
        torch.backends.cudnn.deterministic = False
 | 
			
		||||
        torch.backends.cudnn.benchmark = True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_block(args):
 | 
			
		||||
    # init
 | 
			
		||||
    rank, world_size = args
 | 
			
		||||
    build_environment(rank, world_size)
 | 
			
		||||
    device = torch.device("cuda")
 | 
			
		||||
    rtol, atol = (1e-3, 5e-3)
 | 
			
		||||
 | 
			
		||||
    # fix seed
 | 
			
		||||
    seed_all(1024)
 | 
			
		||||
 | 
			
		||||
    # define block
 | 
			
		||||
    blocks = nn.ModuleList(
 | 
			
		||||
        [
 | 
			
		||||
            PackedFlashBaseLayer1D(
 | 
			
		||||
                hidden_size=4,  # 768
 | 
			
		||||
                num_attention_heads=2,  # 12
 | 
			
		||||
                mlp_ratio=2,
 | 
			
		||||
                attn_drop_rate=0.0,
 | 
			
		||||
                drop_rate=0.0,
 | 
			
		||||
                dtype=torch.bfloat16,
 | 
			
		||||
                layer_norm_epsilon=1e-5,
 | 
			
		||||
                checkpoint=lid < 0,
 | 
			
		||||
                layer_idx=lid + 0,  # This parameter is used for caching during generation
 | 
			
		||||
                residual_in_fp32=False,
 | 
			
		||||
                device=device,
 | 
			
		||||
                norm_type="rmsnorm",
 | 
			
		||||
                dropout_selective_checkpoint=True,
 | 
			
		||||
                use_scaled_init=True,
 | 
			
		||||
                use_swiglu=True,
 | 
			
		||||
            )
 | 
			
		||||
            for lid in range(4)  # 32
 | 
			
		||||
        ]
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # create input
 | 
			
		||||
    cu_seqlens = torch.tensor([0, 2, 4], dtype=torch.int32).to(device)  # [0, 8, 16]
 | 
			
		||||
    indexes = torch.tensor([0, 1, 0, 1]).to(device)  # [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]
 | 
			
		||||
    hidden_states = torch.tensor([[0, 3, 2, 1]]).to(device)  # [[4, 118, 0, 1, 2, 3, 0, 1, 1, 97, 0, 0, 0, 0, 0, 0]]
 | 
			
		||||
    max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
 | 
			
		||||
 | 
			
		||||
    hidden_states = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [
 | 
			
		||||
                [-1.1620, 1.3113, 0.1507, 2.2698],
 | 
			
		||||
                [-1.2610, 1.0990, 0.3787, -0.3478],
 | 
			
		||||
                [1.4001, 1.1982, -0.6696, 0.3269],
 | 
			
		||||
                [1.3304, 1.2262, 1.0735, -1.1169],
 | 
			
		||||
            ]
 | 
			
		||||
        ]
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    hidden_states = hidden_states.squeeze(0).to(device).requires_grad_()
 | 
			
		||||
 | 
			
		||||
    # forward
 | 
			
		||||
    for _, block in enumerate(blocks):
 | 
			
		||||
        block = block.to(torch.bfloat16)
 | 
			
		||||
        block = block.to(device)
 | 
			
		||||
        hidden_states = block(
 | 
			
		||||
            hidden_states,
 | 
			
		||||
            cu_seqlens=cu_seqlens,
 | 
			
		||||
            indexes=indexes,
 | 
			
		||||
            inference_params=None,
 | 
			
		||||
            max_seqlen=max_seqlen,
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    result = hidden_states
 | 
			
		||||
    standard_result = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [-1.1621, 1.3111, 0.1509, 2.2697],
 | 
			
		||||
            [-1.2611, 1.0988, 0.3787, -0.3478],
 | 
			
		||||
            [1.4000, 1.1982, -0.6694, 0.3268],
 | 
			
		||||
            [1.3303, 1.2262, 1.0736, -1.1169],
 | 
			
		||||
        ]
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # check output
 | 
			
		||||
    assert torch.allclose(result, standard_result, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
    hidden_states.retain_grad()
 | 
			
		||||
    loss = torch.randn_like(result)
 | 
			
		||||
 | 
			
		||||
    # backward
 | 
			
		||||
    result.backward(loss)
 | 
			
		||||
 | 
			
		||||
    grad = hidden_states.grad
 | 
			
		||||
    standard_grad = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [0.7999, -0.2595, 0.2649, -1.3256],
 | 
			
		||||
            [0.7064, 0.0283, -0.5508, 0.6494],
 | 
			
		||||
            [-1.4657, -2.0316, 1.3776, 0.7211],
 | 
			
		||||
            [-0.6046, 0.4329, -0.1884, 1.1170],
 | 
			
		||||
        ]
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # check grad
 | 
			
		||||
    assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_head(args):
 | 
			
		||||
    # init
 | 
			
		||||
    rank, world_size, is_reward = args
 | 
			
		||||
    device = torch.device("cuda")
 | 
			
		||||
    build_environment(rank, world_size)
 | 
			
		||||
    rtol, atol = (1e-3, 5e-3)
 | 
			
		||||
    hidden_size = 4
 | 
			
		||||
    vocab_size = 4
 | 
			
		||||
    embed_grad_scale = 1
 | 
			
		||||
 | 
			
		||||
    # fix seed
 | 
			
		||||
    seed_all(1024)
 | 
			
		||||
 | 
			
		||||
    # load standard
 | 
			
		||||
    if is_reward:
 | 
			
		||||
        head_cls = RewardModelLinear
 | 
			
		||||
        standard_result = torch.tensor([[3.5938], [1.0703], [3.6250], [3.6250]], dtype=torch.bfloat16).to(device)
 | 
			
		||||
        standard_grad = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [-0.2246, 0.0164, -0.0591, 0.1660],
 | 
			
		||||
                [-0.5625, 0.0408, -0.1484, 0.4160],
 | 
			
		||||
                [-0.1758, 0.0128, -0.0464, 0.1299],
 | 
			
		||||
                [-0.4785, 0.0347, -0.1260, 0.3516],
 | 
			
		||||
            ],
 | 
			
		||||
            dtype=torch.bfloat16,
 | 
			
		||||
        ).to(device)
 | 
			
		||||
    else:
 | 
			
		||||
        head_cls = ScaleColumnParallelLinear
 | 
			
		||||
        standard_result = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [3.5938, -2.2188, 2.0312, 3.5625],
 | 
			
		||||
                [1.0703, -1.1797, 1.1406, 1.6641],
 | 
			
		||||
                [3.6250, -2.0156, 1.7656, 3.4531],
 | 
			
		||||
                [3.6250, -2.0156, 1.7656, 3.4531],
 | 
			
		||||
            ],
 | 
			
		||||
            dtype=torch.bfloat16,
 | 
			
		||||
        ).to(device)
 | 
			
		||||
        standard_grad = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [-0.2354, 0.0981, -0.2930, -0.6328],
 | 
			
		||||
                [0.2344, -0.2334, -0.0918, 0.1396],
 | 
			
		||||
                [-0.5898, -1.0156, -0.7070, 1.3750],
 | 
			
		||||
                [0.0242, -0.1494, 0.1206, -0.0427],
 | 
			
		||||
            ],
 | 
			
		||||
            dtype=torch.bfloat16,
 | 
			
		||||
        ).to(device)
 | 
			
		||||
 | 
			
		||||
    # define head
 | 
			
		||||
    head = head_cls(
 | 
			
		||||
        in_features=hidden_size,
 | 
			
		||||
        out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size,
 | 
			
		||||
        process_group=gpc.get_group(ParallelMode.TENSOR),
 | 
			
		||||
        bias=False,
 | 
			
		||||
        device=device,
 | 
			
		||||
        dtype=torch.bfloat16,
 | 
			
		||||
        weight_scale=embed_grad_scale,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    head = head.to(torch.bfloat16)
 | 
			
		||||
    head = head.to(device)
 | 
			
		||||
 | 
			
		||||
    # create input
 | 
			
		||||
    hidden_states = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [8.3726, 1.9245, 5.5101, 1.0000],
 | 
			
		||||
            [3.3474, 2.9582, 1.0000, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
        ],
 | 
			
		||||
        dtype=torch.bfloat16,
 | 
			
		||||
        requires_grad=True,
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # forward
 | 
			
		||||
    result = head(hidden_states)
 | 
			
		||||
 | 
			
		||||
    # check output
 | 
			
		||||
    assert torch.allclose(result, standard_result, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
    hidden_states.retain_grad()
 | 
			
		||||
    loss = torch.randn_like(result)
 | 
			
		||||
 | 
			
		||||
    # backward
 | 
			
		||||
    result.backward(loss)
 | 
			
		||||
    grad = hidden_states.grad
 | 
			
		||||
 | 
			
		||||
    # check grad
 | 
			
		||||
    assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_gather_forward(args):
 | 
			
		||||
    # init
 | 
			
		||||
    rank, world_size, parallel_tensor = args
 | 
			
		||||
    assert parallel_tensor in [1, 2]
 | 
			
		||||
    config.parallel.tensor = parallel_tensor
 | 
			
		||||
    device = torch.device("cuda")
 | 
			
		||||
    build_environment(rank, world_size)
 | 
			
		||||
    rtol, atol = (1e-3, 5e-3)
 | 
			
		||||
 | 
			
		||||
    # fix seed
 | 
			
		||||
    seed_all(1024)
 | 
			
		||||
 | 
			
		||||
    # load standard
 | 
			
		||||
    if parallel_tensor == 1:
 | 
			
		||||
        standard_result = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [8.3726, 1.9245, 5.5101, 1.0000],
 | 
			
		||||
                [3.3474, 2.9582, 1.0000, 1.0000],
 | 
			
		||||
                [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
                [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
            ]
 | 
			
		||||
        ).to(device)
 | 
			
		||||
        standard_grad = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [-0.4461, 0.5602, -0.0625, -1.3609],
 | 
			
		||||
                [0.4353, 1.2988, 0.9595, -0.1144],
 | 
			
		||||
                [-0.7593, -0.4031, 0.2041, 1.4955],
 | 
			
		||||
                [0.5706, 0.9047, -0.6965, -0.3757],
 | 
			
		||||
            ]
 | 
			
		||||
        ).to(device)
 | 
			
		||||
    else:
 | 
			
		||||
        standard_result = torch.tensor(
 | 
			
		||||
            [
 | 
			
		||||
                [8.3726, 1.9245, 5.5101, 1.0000, 8.3726, 1.9245, 5.5101, 1.0000],
 | 
			
		||||
                [3.3474, 2.9582, 1.0000, 1.0000, 3.3474, 2.9582, 1.0000, 1.0000],
 | 
			
		||||
                [8.3726, 1.2875, 5.5101, 1.0000, 8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
                [8.3726, 1.2875, 5.5101, 1.0000, 8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
            ]
 | 
			
		||||
        ).to(device)
 | 
			
		||||
        if rank % 2 == 0:
 | 
			
		||||
            standard_grad = torch.tensor(
 | 
			
		||||
                [
 | 
			
		||||
                    [-0.4461, 0.5602, -0.0625, -1.3609],
 | 
			
		||||
                    [-0.7593, -0.4031, 0.2041, 1.4955],
 | 
			
		||||
                    [0.8093, 1.7580, 1.2996, -0.7545],
 | 
			
		||||
                    [1.0474, -0.5767, -1.0401, 0.8233],
 | 
			
		||||
                ]
 | 
			
		||||
            ).to(device)
 | 
			
		||||
        else:
 | 
			
		||||
            standard_grad = torch.tensor(
 | 
			
		||||
                [
 | 
			
		||||
                    [0.4353, 1.2988, 0.9595, -0.1144],
 | 
			
		||||
                    [0.5706, 0.9047, -0.6965, -0.3757],
 | 
			
		||||
                    [-1.3589, -0.7202, 0.6094, -0.8208],
 | 
			
		||||
                    [-1.0042, 0.3695, 0.2511, -0.2718],
 | 
			
		||||
                ]
 | 
			
		||||
            ).to(device)
 | 
			
		||||
 | 
			
		||||
    # create input
 | 
			
		||||
    hidden_states = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [8.3726, 1.9245, 5.5101, 1.0000],
 | 
			
		||||
            [3.3474, 2.9582, 1.0000, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
        ],
 | 
			
		||||
        requires_grad=True,
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # forward
 | 
			
		||||
    result = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
 | 
			
		||||
 | 
			
		||||
    # check output
 | 
			
		||||
    assert torch.allclose(result, standard_result, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
    loss = torch.randn_like(result)
 | 
			
		||||
    hidden_states.retain_grad()
 | 
			
		||||
 | 
			
		||||
    # backward
 | 
			
		||||
    result.backward(loss)
 | 
			
		||||
    grad = hidden_states.grad
 | 
			
		||||
 | 
			
		||||
    # check grad
 | 
			
		||||
    assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.block
 | 
			
		||||
def test_block():
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(check_block, [[rank, 8] for rank in range(8)])
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.head
 | 
			
		||||
@pytest.mark.parametrize("is_reward", [True, False])
 | 
			
		||||
def test_head(is_reward):
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(check_head, [[rank, 8, is_reward] for rank in range(8)])
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.gather_forward
 | 
			
		||||
@pytest.mark.parametrize("parallel_tensor", [1, 2])
 | 
			
		||||
def test_gather_forward(parallel_tensor):
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(check_gather_forward, [[rank, 8, parallel_tensor] for rank in range(8)])
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    pytest.main(["-s", "-q", "test_model_internlm.py"])
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,84 @@
 | 
			
		|||
import multiprocessing as mp
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
import torch
 | 
			
		||||
 | 
			
		||||
from internlm.model.utils import try_import_RMSNorm
 | 
			
		||||
from tests.test_model.test_model_internlm import build_environment, seed_all
 | 
			
		||||
 | 
			
		||||
RMSNorm = try_import_RMSNorm()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def check_norm(args):
 | 
			
		||||
    # init
 | 
			
		||||
    rank, world_size = args
 | 
			
		||||
    device = torch.device("cuda")
 | 
			
		||||
    build_environment(rank, world_size)
 | 
			
		||||
    rtol, atol = (1e-3, 5e-3)
 | 
			
		||||
    hidden_size = 4
 | 
			
		||||
    layer_norm_epsilon = 1e-05
 | 
			
		||||
 | 
			
		||||
    # fix seed
 | 
			
		||||
    seed_all(1024)
 | 
			
		||||
 | 
			
		||||
    # define norm
 | 
			
		||||
    norm = RMSNorm(hidden_size, eps=layer_norm_epsilon)
 | 
			
		||||
    norm = norm.to(device)
 | 
			
		||||
 | 
			
		||||
    # create input
 | 
			
		||||
    hidden_states = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [8.3726, 1.9245, 5.5101, 1.0000],
 | 
			
		||||
            [3.3474, 2.9582, 1.0000, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
            [8.3726, 1.2875, 5.5101, 1.0000],
 | 
			
		||||
        ],
 | 
			
		||||
        requires_grad=True,
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # forward
 | 
			
		||||
    result = norm(hidden_states.float())
 | 
			
		||||
 | 
			
		||||
    standard = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [1.6329, 0.3753, 1.0746, 0.1950],
 | 
			
		||||
            [1.4288, 1.2626, 0.4268, 0.4268],
 | 
			
		||||
            [1.6490, 0.2536, 1.0852, 0.1970],
 | 
			
		||||
            [1.6490, 0.2536, 1.0852, 0.1970],
 | 
			
		||||
        ]
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # check output
 | 
			
		||||
    assert torch.allclose(result, standard, rtol=rtol, atol=atol, equal_nan=True)
 | 
			
		||||
 | 
			
		||||
    hidden_states.retain_grad()
 | 
			
		||||
    loss = torch.randn_like(result)
 | 
			
		||||
 | 
			
		||||
    # backward
 | 
			
		||||
    result.backward(loss)
 | 
			
		||||
    grad = hidden_states.grad
 | 
			
		||||
 | 
			
		||||
    standard_grad = torch.tensor(
 | 
			
		||||
        [
 | 
			
		||||
            [-0.0193, 0.1248, 0.0324, -0.2573],
 | 
			
		||||
            [-0.2140, 0.2010, 0.2901, -0.1683],
 | 
			
		||||
            [-0.0815, -0.0689, 0.0850, 0.3027],
 | 
			
		||||
            [0.0847, 0.1739, -0.1554, -0.0773],
 | 
			
		||||
        ]
 | 
			
		||||
    ).to(device)
 | 
			
		||||
 | 
			
		||||
    # check grad
 | 
			
		||||
    assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol, equal_nan=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.norm
 | 
			
		||||
def test_norm():
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(check_norm, [[rank, 8] for rank in range(8)])
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    pytest.main(["-s", "-q", "test_norm.py"])
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,364 @@
 | 
			
		|||
import copy
 | 
			
		||||
import multiprocessing as mp
 | 
			
		||||
import random
 | 
			
		||||
 | 
			
		||||
import numpy as np
 | 
			
		||||
import pytest
 | 
			
		||||
import torch
 | 
			
		||||
from torch import nn
 | 
			
		||||
from torch.nn.parallel import DistributedDataParallel as DDP
 | 
			
		||||
from torch.testing import assert_close
 | 
			
		||||
 | 
			
		||||
import internlm
 | 
			
		||||
from internlm.core.context.parallel_context import Config
 | 
			
		||||
from internlm.solver.optimizer import HybridZeroOptimizer
 | 
			
		||||
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MlpModel(nn.Module):
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        super().__init__()
 | 
			
		||||
        self.linear1 = nn.Linear(128, 256)
 | 
			
		||||
        self.linear2 = nn.Linear(256, 512)
 | 
			
		||||
 | 
			
		||||
    def forward(self, x):
 | 
			
		||||
        x = self.linear1(x)
 | 
			
		||||
        x = self.linear2(x)
 | 
			
		||||
        return x
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
config = Config(
 | 
			
		||||
    dict(
 | 
			
		||||
        parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1),
 | 
			
		||||
        model_type="INTERNLM",
 | 
			
		||||
        data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999),
 | 
			
		||||
        model=dict(
 | 
			
		||||
            dtype=torch.bfloat16,
 | 
			
		||||
        ),
 | 
			
		||||
        resume_tb_folder="",
 | 
			
		||||
        tensorboard_folder="",
 | 
			
		||||
        alert_address=None,
 | 
			
		||||
        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
 | 
			
		||||
        grad_scaler=dict(
 | 
			
		||||
            fp16=dict(
 | 
			
		||||
                initial_scale=1,
 | 
			
		||||
                min_scale=1,
 | 
			
		||||
                growth_interval=1,
 | 
			
		||||
            ),
 | 
			
		||||
            growth_factor=1.1,
 | 
			
		||||
            backoff_factor=0.9,
 | 
			
		||||
            max_scale=1,
 | 
			
		||||
            hysteresis=1,
 | 
			
		||||
        ),
 | 
			
		||||
        adam=dict(
 | 
			
		||||
            lr=1e-4,
 | 
			
		||||
            adam_beta1=0.9,
 | 
			
		||||
            adam_beta2=0.95,
 | 
			
		||||
            adam_beta2_c=0,
 | 
			
		||||
            adam_eps=1e-8,
 | 
			
		||||
            weight_decay=0.01,
 | 
			
		||||
        ),
 | 
			
		||||
        hybrid_zero_optimizer=dict(
 | 
			
		||||
            overlap_sync_grad=False,
 | 
			
		||||
            overlap_sync_param=False,
 | 
			
		||||
            reduce_bucket_size=512 * 1024 * 1024,
 | 
			
		||||
            clip_grad_norm=1.0,
 | 
			
		||||
        ),
 | 
			
		||||
    )
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def build_environment(rank, world_size):
 | 
			
		||||
    import os
 | 
			
		||||
 | 
			
		||||
    os.environ["RANK"] = str(rank)
 | 
			
		||||
    os.environ["LOCAL_RANK"] = str(rank)
 | 
			
		||||
    os.environ["WORLD_SIZE"] = str(world_size)
 | 
			
		||||
    os.environ["MASTER_ADDR"] = "127.0.0.1"
 | 
			
		||||
    os.environ["MASTER_PORT"] = "12345"
 | 
			
		||||
    torch.cuda.empty_cache()
 | 
			
		||||
    # launcher="torch"
 | 
			
		||||
    internlm.launch_from_torch(config=config, seed=1024)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def loose_close(a, b, dtype: torch.dtype = torch.float32):
 | 
			
		||||
 | 
			
		||||
    if dtype is torch.float32:
 | 
			
		||||
        rtol = 1.3e-6
 | 
			
		||||
        atol = 1e-5
 | 
			
		||||
    elif dtype is torch.bfloat16:
 | 
			
		||||
        rtol = 2e-2
 | 
			
		||||
        atol = 2e-2
 | 
			
		||||
 | 
			
		||||
    if isinstance(a, torch.Tensor):
 | 
			
		||||
        a = a.detach().to(dtype)
 | 
			
		||||
        b = b.detach().to(dtype)
 | 
			
		||||
 | 
			
		||||
    assert_close(a, b, rtol=rtol, atol=atol)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def init_optimizer_grouped_parameters(check_group, model):
 | 
			
		||||
    if check_group:
 | 
			
		||||
        optimizer_grouped_parameters = [
 | 
			
		||||
            {
 | 
			
		||||
                "params": list(model.parameters())[:2],
 | 
			
		||||
                "weight_decay": config.adam.weight_decay,
 | 
			
		||||
            },
 | 
			
		||||
            {
 | 
			
		||||
                "params": list(model.parameters())[2:],
 | 
			
		||||
                "weight_decay": config.adam.weight_decay,
 | 
			
		||||
            },
 | 
			
		||||
        ]
 | 
			
		||||
    else:
 | 
			
		||||
        optimizer_grouped_parameters = [{"params": model.parameters(), "weight_decay": config.adam.weight_decay}]
 | 
			
		||||
 | 
			
		||||
    return optimizer_grouped_parameters
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def seed_all(seed, cuda_deterministic=False):
 | 
			
		||||
    random.seed(seed)
 | 
			
		||||
    np.random.seed(seed)
 | 
			
		||||
    torch.manual_seed(seed)
 | 
			
		||||
    if torch.cuda.is_available():
 | 
			
		||||
        torch.cuda.manual_seed(seed)
 | 
			
		||||
        torch.cuda.manual_seed_all(seed)
 | 
			
		||||
    if cuda_deterministic:  # slower, more reproducible
 | 
			
		||||
        torch.backends.cudnn.deterministic = True
 | 
			
		||||
        torch.backends.cudnn.benchmark = False
 | 
			
		||||
    else:
 | 
			
		||||
        torch.backends.cudnn.deterministic = False
 | 
			
		||||
        torch.backends.cudnn.benchmark = True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def exam_hybrid_zero_optim_with_ddp(args):
 | 
			
		||||
    # init
 | 
			
		||||
    rank, world_size, zero_parallel, overlap_sync_param, overlap_sync_grad, micro_num, check_group, dtype = args
 | 
			
		||||
    # TODO: Need to test the combination of overlap param and group_params when ready
    # ParamBcastSyncHandler does not consider parameters in different optimizer groups currently
 | 
			
		||||
    if overlap_sync_param and check_group:
 | 
			
		||||
        return
 | 
			
		||||
    config.parallel.zero1 = zero_parallel
 | 
			
		||||
    config.hybrid_zero_optimizer.overlap_sync_param = overlap_sync_param
 | 
			
		||||
    config.hybrid_zero_optimizer.overlap_sync_grad = overlap_sync_grad
 | 
			
		||||
    config.data.micro_num = micro_num
 | 
			
		||||
    config.model.dtype = dtype
 | 
			
		||||
    total_step = 5
 | 
			
		||||
    if not overlap_sync_param:
 | 
			
		||||
        total_step = 1
 | 
			
		||||
 | 
			
		||||
    build_environment(rank, world_size)
 | 
			
		||||
    seed_all(1024)
 | 
			
		||||
 | 
			
		||||
    # create models
 | 
			
		||||
    torch_model = MlpModel().cuda()
 | 
			
		||||
    zero_model = copy.deepcopy(torch_model).to(dtype)
 | 
			
		||||
    torch_model = DDP(torch_model.cuda(), static_graph=True).cuda()
 | 
			
		||||
 | 
			
		||||
    # create optimizer
 | 
			
		||||
    if config.hybrid_zero_optimizer.overlap_sync_param:
 | 
			
		||||
        param_bcast_sync_handler = ParamBcastSyncHandler(zero_model)
 | 
			
		||||
    else:
 | 
			
		||||
        param_bcast_sync_handler = None
 | 
			
		||||
 | 
			
		||||
    optimizer_grouped_parameters_zero = init_optimizer_grouped_parameters(check_group, zero_model)
 | 
			
		||||
    optimizer_grouped_parameters_torch = init_optimizer_grouped_parameters(check_group, torch_model)
 | 
			
		||||
 | 
			
		||||
    naive_optimizer = torch.optim.AdamW(
 | 
			
		||||
        params=optimizer_grouped_parameters_zero,
 | 
			
		||||
        lr=config.adam.lr,
 | 
			
		||||
        betas=(config.adam.adam_beta1, config.adam.adam_beta2),
 | 
			
		||||
        eps=config.adam.adam_eps,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    zero_optimizer = HybridZeroOptimizer(
 | 
			
		||||
        naive_optimizer,
 | 
			
		||||
        grad_scal_cfg=config.grad_scaler,
 | 
			
		||||
        zero_cfg=config.hybrid_zero_optimizer,
 | 
			
		||||
        param_bcast_sync_handler=param_bcast_sync_handler,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    torch_optimizer = torch.optim.AdamW(
 | 
			
		||||
        params=optimizer_grouped_parameters_torch,
 | 
			
		||||
        lr=config.adam.lr,
 | 
			
		||||
        betas=(config.adam.adam_beta1, config.adam.adam_beta2),
 | 
			
		||||
        eps=config.adam.adam_eps,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    for _ in range(total_step):
 | 
			
		||||
        zero_optimizer.zero_grad()
 | 
			
		||||
        torch_optimizer.zero_grad()
 | 
			
		||||
        zero_optimizer.skip_grad_reduce = True
 | 
			
		||||
        for num in range(micro_num):
 | 
			
		||||
            if num == micro_num - 1:
 | 
			
		||||
                zero_optimizer.skip_grad_reduce = False
 | 
			
		||||
 | 
			
		||||
            seed_all(1024 + rank)
 | 
			
		||||
            # create input
 | 
			
		||||
            input_data = torch.rand(16, 128).cuda()
 | 
			
		||||
 | 
			
		||||
            # zero-dp forward
 | 
			
		||||
            zero_output = zero_model(input_data.to(dtype))
 | 
			
		||||
 | 
			
		||||
            # torch-ddp forward
 | 
			
		||||
            torch_output = torch_model(input_data)
 | 
			
		||||
 | 
			
		||||
            # check output
 | 
			
		||||
            loose_close(zero_output, torch_output, dtype=dtype)
 | 
			
		||||
 | 
			
		||||
            # zero-dp backward
 | 
			
		||||
            zero_optimizer.backward(zero_output.mean())
 | 
			
		||||
 | 
			
		||||
            # torch-ddp backward
 | 
			
		||||
            if num == micro_num - 1:
 | 
			
		||||
                torch_output.mean().backward()
 | 
			
		||||
            else:
 | 
			
		||||
                with torch_model.no_sync():
 | 
			
		||||
                    torch_output.mean().backward()
 | 
			
		||||
 | 
			
		||||
        # zero-dp step
 | 
			
		||||
        zero_optimizer.step()
 | 
			
		||||
 | 
			
		||||
        # torch-ddp step
 | 
			
		||||
        torch_optimizer.step()
 | 
			
		||||
 | 
			
		||||
        # check grad
 | 
			
		||||
        if check_group:
 | 
			
		||||
            group1 = zip(list(torch_model.parameters())[:2], list(zero_model.parameters())[:2])
 | 
			
		||||
            group2 = zip(list(torch_model.parameters())[2:], list(zero_model.parameters())[2:])
 | 
			
		||||
            for torch_parm, zero_parm in group1:
 | 
			
		||||
                if zero_parm.grad is not None:
 | 
			
		||||
                    loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype)
 | 
			
		||||
            for torch_parm, zero_parm in group2:
 | 
			
		||||
                if zero_parm.grad is not None:
 | 
			
		||||
                    loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype)
 | 
			
		||||
        else:
 | 
			
		||||
            for torch_parm, zero_parm in zip(torch_model.parameters(), zero_model.parameters()):
 | 
			
		||||
                if zero_parm.grad is not None:
 | 
			
		||||
                    loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype)
 | 
			
		||||
 | 
			
		||||
    torch.cuda.synchronize()
 | 
			
		||||
    # check updated param
 | 
			
		||||
    if check_group:
 | 
			
		||||
        group1 = zip(list(torch_model.parameters())[:2], list(zero_model.parameters())[:2])
 | 
			
		||||
        group2 = zip(list(torch_model.parameters())[2:], list(zero_model.parameters())[2:])
 | 
			
		||||
        for torch_parm, zero_parm in group1:
 | 
			
		||||
            loose_close(torch_parm, zero_parm, dtype=dtype)
 | 
			
		||||
        for torch_parm, zero_parm in group2:
 | 
			
		||||
            loose_close(torch_parm, zero_parm, dtype=dtype)
 | 
			
		||||
    else:
 | 
			
		||||
        for torch_parm, zero_parm in zip(torch_model.parameters(), zero_model.parameters()):
 | 
			
		||||
            loose_close(torch_parm, zero_parm, dtype=dtype)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def exam_hybrid_zero_optim_with_ckpt_load_save(args):
    # init
    rank, world_size, zero_parallel, check_group, dtype = args
    config.parallel.zero1 = zero_parallel
    config.parallel.dtype = dtype

    build_environment(rank, world_size)

    # create models
    zero_model = MlpModel().cuda().to(dtype)

    # create optimizer
    if config.hybrid_zero_optimizer.overlap_sync_param:
        param_bcast_sync_handler = ParamBcastSyncHandler(zero_model)
    else:
        param_bcast_sync_handler = None

    optimizer_grouped_parameters1 = init_optimizer_grouped_parameters(check_group, zero_model)
    optimizer_grouped_parameters2 = init_optimizer_grouped_parameters(check_group, zero_model)

    naive_optimizer = torch.optim.AdamW(
        params=optimizer_grouped_parameters1,
        lr=config.adam.lr,
        betas=(config.adam.adam_beta1, config.adam.adam_beta2),
        eps=config.adam.adam_eps,
    )

    zero_optimizer = HybridZeroOptimizer(
        naive_optimizer,
        grad_scal_cfg=config.grad_scaler,
        zero_cfg=config.hybrid_zero_optimizer,
        param_bcast_sync_handler=param_bcast_sync_handler,
    )

    naive_optimizer2 = torch.optim.AdamW(
        params=optimizer_grouped_parameters2,
        lr=config.adam.lr,
        betas=(config.adam.adam_beta1, config.adam.adam_beta2),
        eps=config.adam.adam_eps,
    )

    zero_optimizer2 = HybridZeroOptimizer(
        naive_optimizer2,
        grad_scal_cfg=config.grad_scaler,
        zero_cfg=config.hybrid_zero_optimizer,
        param_bcast_sync_handler=param_bcast_sync_handler,
    )

    # save and load states
    states = zero_optimizer.state_dict()
    zero_optimizer2.load_state_dict(states)

    # check fp32 model weights
    for zero1_param, zero2_param in zip(
        zero_optimizer._fp32_flat_param_groups_of_current_rank.values(),
        zero_optimizer2._fp32_flat_param_groups_of_current_rank.values(),
    ):
        assert torch.equal(zero1_param, zero2_param)

    # check fp16 model weights
    for zero1_param, zero2_param in zip(
        zero_optimizer._fp16_param_groups.values(), zero_optimizer2._fp16_param_groups.values()
    ):
        assert zero1_param == zero2_param


zero_parallel_check_list = [-1, 1, 4]
overlap_sync_param_check_list = [True, False]
overlap_sync_grad_check_list = [True, False]
micro_num_check_list = [1, 2, 4]
check_group_list = [True, False]
dtype_list = [torch.float32, torch.bfloat16]


@pytest.mark.parametrize("zero_parallel", zero_parallel_check_list)
 | 
			
		||||
@pytest.mark.parametrize("overlap_sync_param", overlap_sync_param_check_list)
 | 
			
		||||
@pytest.mark.parametrize("overlap_sync_grad", overlap_sync_grad_check_list)
 | 
			
		||||
@pytest.mark.parametrize("micro_num", miro_num_check_list)
 | 
			
		||||
@pytest.mark.parametrize("check_group", check_group_list)
 | 
			
		||||
@pytest.mark.parametrize("dtype", dtype_list)
 | 
			
		||||
def test_hybrid_zero_optim_with_ddp(
 | 
			
		||||
    zero_parallel, overlap_sync_param, overlap_sync_grad, micro_num, check_group, dtype
 | 
			
		||||
):
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(
 | 
			
		||||
            exam_hybrid_zero_optim_with_ddp,
 | 
			
		||||
            [
 | 
			
		||||
                [rank, 8, zero_parallel, overlap_sync_param, overlap_sync_grad, micro_num, check_group, dtype]
 | 
			
		||||
                for rank in range(8)
 | 
			
		||||
            ],
 | 
			
		||||
        )
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize("zero_parallel", zero_parallel_check_list)
 | 
			
		||||
@pytest.mark.parametrize("check_group", check_group_list)
 | 
			
		||||
@pytest.mark.parametrize("dtype", dtype_list)
 | 
			
		||||
def test_hybrid_zero_optim_with_ckpt_load_save(zero_parallel, check_group, dtype):
 | 
			
		||||
    ctx = mp.get_context("spawn")
 | 
			
		||||
    with ctx.Pool(processes=8) as pool:
 | 
			
		||||
        pool.map(
 | 
			
		||||
            exam_hybrid_zero_optim_with_ckpt_load_save,
 | 
			
		||||
            [[rank, 8, zero_parallel, check_group, dtype] for rank in range(8)],
 | 
			
		||||
        )
 | 
			
		||||
        pool.close()
 | 
			
		||||
        pool.join()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
    pytest.main(["-s", "-q", "test_optimizer.py"])

@ -38,7 +38,7 @@ def convert2hf(model_config, states_tp_pps):
        current_states["lm_head.weight"] = states.pop("head.weight")

        for i in range(model_config["num_layers"]):
            states.pop(f"blocks.{i}.mixer.rotary_emb.inv_freq")
            states.pop(f"blocks.{i}.mixer.rotary_emb.inv_freq", None)

            wqkv = states.pop(f"blocks.{i}.mixer.Wqkv.weight").reshape(
                3, model_config["num_attention_heads"], -1, model_config["hidden_size"]
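
The only change in this hunk is the added default argument to `pop`: with it, a state dict that has no `rotary_emb.inv_freq` entry is simply skipped instead of raising. This is plain `dict.pop` semantics, illustrated below with a throwaway dictionary rather than repository code:

```python
d = {"a": 1}
d.pop("missing", None)  # returns the default (None); no error when the key is absent
d.pop("missing")        # raises KeyError
```
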
@ -20,6 +20,7 @@
""" PyTorch InternLM model."""
import math
from typing import List, Optional, Tuple, Union
import threading, queue

import torch
import torch.utils.checkpoint
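
The newly imported `threading` and `queue` modules support the producer/consumer streaming added to `stream_chat` in the next hunk. As orientation, here is a minimal, self-contained sketch of that pattern (illustrative only, not the repository's actual wiring): a background thread pushes items onto a `queue.Queue`, and a generator drains the queue until it sees a `None` sentinel.

```python
import queue
import threading


def stream(produce):
    """Yield items produced on a background thread, stopping at a None sentinel."""
    q = queue.Queue(maxsize=20)

    def producer():
        for item in produce():
            q.put(item)
        q.put(None)  # sentinel: nothing more will be produced

    threading.Thread(target=producer, daemon=True).start()
    while True:
        item = q.get()
        if item is None:
            return
        yield item


print(list(stream(lambda: iter("abc"))))  # ['a', 'b', 'c']
```
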
@ -810,35 +811,70 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                    temperature: float = 0.8,
                    top_p: float = 0.8,
                    **kwargs):
        """
        Return a generator in format: (response, history)
        Eg.
        ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
        ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
        """

        response_queue = queue.Queue(maxsize=20)

        class ChatStreamer(BaseStreamer):
            def __init__(self, tokenizer) -> None:
                super().__init__()
                self.tokenizer = tokenizer

                self.queue = response_queue
                self.query = query
                self.history = history
                self.response = ""
                self.received_inputs = False
                self.queue.put((self.response, history + [(self.query, self.response)]))

            def put(self, value):
                if len(value.shape) > 1 and value.shape[0] > 1:
                    raise ValueError("ChatStreamer only supports batch size 1")
                elif len(value.shape) > 1:
                    value = value[0]

                if not self.received_inputs:
                    # The first received value is input_ids, ignore here
                    self.received_inputs = True
                    return

                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
                if token.strip() != "<eoa>":
                    print(token, end="")

                    self.response = self.response + token
                    history = self.history + [(self.query, self.response)]
                    self.queue.put((self.response, history))

            def end(self):
                print("")

        return self.chat(
            tokenizer=tokenizer,
            query=query,
            streamer=ChatStreamer(tokenizer=tokenizer),
            history=history,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            **kwargs
        )

                self.queue.put(None)

        def stream_producer():
            return self.chat(
                tokenizer=tokenizer,
                query=query,
                streamer=ChatStreamer(tokenizer=tokenizer),
                history=history,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                top_p=top_p,
                **kwargs
            )

        def consumer():
            producer = threading.Thread(target=stream_producer)
            producer.start()
            while True:
                res = response_queue.get()
                if res is None:
                    # ChatStreamer.end() pushes None to signal that generation has finished
                    return
                yield res

        return consumer()


@add_start_docstrings(
    """