mirror of https://github.com/InternLM/InternLM

docs(doc/code-docs): update quickstart usage (#301)

* docs(usage.md): update usage.md
* docs(doc/code-docs): update en usage

Co-authored-by: huangting4201 <huangting3@sensetime.com>

pull/314/head^2
parent 09e71cebf3
commit 42802a2b31
@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"POT-Creation-Date: 2023-09-11 14:25+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"

@@ -19,30 +19,30 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"

#: ../../source/checkpoint.rst:2 09c8645fba264cdf9a80c4b62c2bb4d1
#: ../../source/checkpoint.rst:2
msgid "模型保存"
msgstr "Model Checkpointing"

#: ../../source/checkpoint.rst:4 8b158d34631045b1afdb4fb0169b3c71
#: ../../source/checkpoint.rst:4
msgid ""
"InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` "
"来管理模型保存。 其中,可以 使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
"来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。"
msgstr ""
"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation, "
"we use ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports "
"automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit."
"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to "
"manage model checkpointing. In the implementation, we use "
"``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint "
"training states at specific steps. InternLM supports automatic loading of"
" latest ckpt at startup and automatic model checkpointing at signal quit."
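The checkpointing flow described in the entry above can be pictured with a minimal sketch. This is an illustration only: the method names follow this .po file, while the surrounding loop objects are assumed placeholders rather than InternLM's actual training code.

```python
# Minimal sketch, assuming a ckpt_manager built elsewhere from CheckpointManager;
# method names follow the entries in this file, everything else is a placeholder.
def train_loop(ckpt_manager, train_dataloader, train_state, train_step):
    for batch in train_dataloader:
        train_step(batch)  # user-supplied forward/backward/optimizer step
        # Saves the model state for the current step when the configured
        # condition is met, or when a quit signal has been recorded.
        ckpt_manager.try_save_checkpoint(train_state)
    # Wait for asynchronous checkpoint uploads to complete before exiting.
    ckpt_manager.wait_async_upload_finish()
```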
#: ../../source/checkpoint.rst:8 a023b5a6d15749bfaa51cf2da194bda1
#: ../../source/checkpoint.rst:8
msgid "Checkpointing"
msgstr ""

#: 938575c699d1426c87e0b3f589a85d50
#: internlm.utils.model_checkpoint.CheckpointManager:1 of
msgid "StorageManagerContext"
msgstr ""

#: 754d6881cd034c5ebaab0f3362dd14c2
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of
msgid ""
"Exit signal detection function, if we write the exit step in the "

@@ -51,34 +51,27 @@ msgid ""
"quit."
msgstr ""

#: 2169f9fb4a8b40bc9bf6093894fc7a5e 6a55d2b2b24a44c8b78b40f19f4d950b
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training of
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "参数"
msgstr ""

#: 360a89b1591e4627ac432f4d75050354
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回"
msgstr ""

#: 2426832f4a8a4c5481be1c940e0e7b50
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of
msgid "whether to quit."
msgstr ""

#: 5f6842c261544a3c89f32d981b3ad755
#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
msgid "返回类型"
msgstr ""

#: 1392da84b6e645bcb8dab605e1231fdc
#: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1
#: of
msgid "wait for all checkpoint uploads to be completed"
msgstr ""

#: d1774593e9c94608b49b10504bfbc38b
#: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1
#: of
msgid ""

@@ -86,38 +79,25 @@ msgid ""
"found, None will return."
msgstr ""

#: a3abbbd2bd574872892d908ab248e804
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:1 of
msgid "Attempt to restore the training state of the last ckpt."
msgstr ""

#: de021d1eb6d54955a2850c11c0191710
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:3 of
msgid "lr_scheduler object."
msgstr ""

#: 20be15854f2e420a9d96c86b5869bfa6
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:5 of
msgid "optimizer object."
msgstr ""

#: 68f69086c5054acc8aca15c8a764acc5
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:7 of
msgid "learning rate."
msgstr ""

#: 5d34d34a972d4abeab4bda3e49ee157b
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:9 of
msgid "traing states."
msgstr ""

#: 82ebb67afaa748ecabc4cef598d7fc30
#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:11 of
msgid "traning dataloader object"
msgstr ""

#: 0c95dfcd712749279daca78166bb4326
#: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of
msgid "Save checkpoint to the given folder path."
msgstr ""

#~ msgid "Attempt to restore the training state of the last ckpt."
#~ msgstr ""

#~ msgid "lr_scheduler object."
#~ msgstr ""

#~ msgid "optimizer object."
#~ msgstr ""

#~ msgid "learning rate."
#~ msgstr ""

#~ msgid "traing states."
#~ msgstr ""

#~ msgid "traning dataloader object"
#~ msgstr ""

@@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 14:15+0800\n"
"POT-Creation-Date: 2023-09-11 14:25+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"

@@ -19,11 +19,11 @@ msgstr ""
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"

#: ../../../usage.md:2 a64aaaa1525e4e01b0ddcebc42c24bbd
#: ../../../usage.md:2
msgid "使用教程"
msgstr "Quickstart Guide"

#: ../../../usage.md:4 f1b40737fb584d889b82c7f55b652977
#: ../../../usage.md:4
msgid ""
"启动一个 Demo "
"模型训练,需要进行三项准备,**安装**,**数据集准备**和**模型训练配置**。接下来,首先会介绍数据准备相关的操作,再简要描述模型训练配置相关的内容。"

@@ -33,21 +33,21 @@ msgstr ""
"configuration**. In this guide, we will first cover the steps for dataset"
" preparation and then briefly describe the model training configuration."

#: ../../../usage.md:6 b35abe307c2f4d23866fff828308ebf2
#: ../../../usage.md:6
msgid "安装"
msgstr "Installation"

#: ../../../usage.md:7 64a8c1f5f71c45519e636aa7edba10bc
#: ../../../usage.md:7
msgid "请参考[安装文档](./install.md)进行安装。"
msgstr ""
"Please refer to the [installation guide](./install.md) for instructions "
"on how to install the necessary dependencies."

#: ../../../usage.md:9 bd96714d12ee415794dea5a4578bd8cd
#: ../../../usage.md:9
msgid "数据准备 (预训练)"
msgstr "Dataset Preparation (Pre-training)"

#: ../../../usage.md:11 5a0b39fb9da94e96b87db40d1f231a0c
#: ../../../usage.md:11
msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型,可直接修改`tokernizer.py`中的模型参数路径。"
msgstr ""
"The dataset for the InternLM training task includes a series of `bin` and"

@@ -58,7 +58,7 @@ msgstr ""
"different model, you can directly modify the model parameter path in "
"`tokenizer.py`."

#: ../../../usage.md:13 3cef8126b8784af48d81cc140322909e
#: ../../../usage.md:13
msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件,其中参数`text_input_path`表示原始文本数据路径,目前支持`txt`、`json`和`jsonl`三种输入格式,`bin_output_path`表示生成的`bin`文件的保存路径。"
msgstr ""
"You can run the following command to generate `bin` and `meta` files "

@@ -67,30 +67,30 @@ msgstr ""
"`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents "
"the save path of the generated `bin` files."

#: ../../../usage.md:18 107ff2280da14cb6a27f4e9857186333
#: ../../../usage.md:18
msgid "下面是一个数据处理的例子:"
msgstr "Here is an example of data processing:"

#: ../../../usage.md:20 c11a9860263c4e2288a561f3435fa706
#: ../../../usage.md:20
msgid "给定一个包含原始数据集的文件`raw_data.txt`,原始数据集如下所示:"
msgstr ""
"Given a file `raw_data.txt` containing the raw dataset, the raw dataset "
"is shown below:"

#: ../../../usage.md:27 4012599b42ab47bd979d2a0b79ca1147
#: ../../../usage.md:27
msgid "可以通过运行以下命令来生成`bin`和`meta`文件:"
msgstr ""
"You can generate the `bin` and `meta` files by running the following "
"command:"

#: ../../../usage.md:32 cca91b6cf53a4082932dd34ea4b7f954
#: ../../../usage.md:32
msgid "需要注意的是,生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下,以区分数据集的类型。"
msgstr ""
"It should be noted that the generated `bin` files need to be saved in one"
" of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or "
"`kaoshi`, depending on the type of dataset."

#: ../../../usage.md:34 417312ca1e35479e811953f777e3565a
#: ../../../usage.md:34
msgid "其中,`cn`表示中文数据集;`en`表示英文数据集;`code`表示代码数据集;`ja`表示日语数据集;`ar`表示阿拉伯语数据集;`kaoshi`表示考试数据集。"
msgstr ""
"Here, `cn` represents the Chinese dataset, `en` represents the English "

@@ -98,22 +98,22 @@ msgstr ""
" dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the"
" exam dataset."

#: ../../../usage.md:36 79c21f8e89b34499ba4e25e20593ec28
#: ../../../usage.md:36
msgid "生成的bin文件的格式如下:"
msgstr "The format of the generated `bin` files is as follows:"

#: ../../../usage.md:42 26388d996c4e4116bc216be9bc007f62
#: ../../../usage.md:42
msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子,表示每个句子的`token`(下文将用sequence指定)。"
msgstr ""
"Each line in the `bin` file corresponds to each sentence in the original "
"dataset, representing the tokens of each sentence (referred to as "
"sequence below)."

#: ../../../usage.md:44 b39148a85ee64a349975d26282fbe59b
#: ../../../usage.md:44
msgid "生成的`meta`文件的格式如下:"
msgstr "The format of the generated `meta` file is as follows:"

#: ../../../usage.md:48 175a6007197a40568535f945672e5df2
#: ../../../usage.md:48
msgid ""
"在`meta`文件中,每个元组对应着`bin`文件中每一个`sequence`的元信息。其中,元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting"
" index`,第二个元素表示每个`sequence`中有多少个`tokens`。"

@@ -123,7 +123,7 @@ msgstr ""
"index` of each `sequence` among all `sequences`, and the second element "
"indicates the number of `tokens` for each `sequence`."

#: ../../../usage.md:50 46874a3de3924837979f9949f1237e39
#: ../../../usage.md:50
msgid ""
"例如,对于第一个`sequence`,`starting index`为 0,有 11 "
"个`tokens`;对于第二个`sequence`,由于第一个`sequence`转换为`string`后的长度为`89`,因此它的`starting"

@@ -132,17 +132,17 @@ msgstr ""
"For example, the first `sequence` starts at index 0 and has 16 `tokens`. "
"The second `sequence` starts at index 110 and has 24 `tokens`."
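A small sketch of the bookkeeping described above may help: each line of the `bin` file holds one tokenized sequence, and the `meta` file records a `(starting index, number of tokens)` tuple per sequence. The JSON-per-line layout assumed below is an illustration, not necessarily the exact on-disk format produced by `tools/tokenizer.py`.

```python
# Illustrative only: derive (starting index, number of tokens) tuples from a bin-like
# file in which every line is assumed to be a JSON object with a "tokens" list.
import json

def build_meta(bin_path: str):
    meta = []
    offset = 0  # starting index of the current sequence among all sequences
    with open(bin_path, "rb") as f:
        for raw_line in f:
            tokens = json.loads(raw_line)["tokens"]
            meta.append((offset, len(tokens)))
            offset += len(raw_line)  # the next sequence starts right after this line
    return meta
```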
#: ../../../usage.md:52 25ea049fa411408b8856e7aa657835ab
#: ../../../usage.md:52
msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。"
msgstr ""
"The `bin` and `meta` file formats for `json` and `jsonl` type files are "
"the same as for `txt`, so we won't go over them here."

#: ../../../usage.md:54 bc52f959cb57494483a181e843014ed1
#: ../../../usage.md:54
msgid "数据准备 (微调)"
msgstr "Data Preparation (Fine-tuning)"

#: ../../../usage.md:56 73c74620c2994486acc747ba0c7f0b46
#: ../../../usage.md:56
msgid ""
"微调任务的数据集格式与预训练任务保持一致,生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca "
"数据集为例,介绍微调的数据准备流程。"

@@ -152,7 +152,7 @@ msgstr ""
"the Alpaca dataset as an example to explain the data preparation process "
"for fine-tuning."

#: ../../../usage.md:58 75f0e22d10ca413389ec8b947ae6141f
#: ../../../usage.md:58
msgid ""
"下载 [Alpaca 数据集](https://github.com/tatsu-"
"lab/stanford_alpaca/blob/main/alpaca_data.json)"

@@ -160,87 +160,86 @@ msgstr ""
"Download the [Alpaca dataset](https://github.com/tatsu-"
"lab/stanford_alpaca/blob/main/alpaca_data.json)."

#: ../../../usage.md:60 667606fcea454af48353a5b40f82fc46
#: ../../../usage.md:60
msgid "对 Alpaca 数据进行 tokenize,使用以下命令"
msgstr "Tokenize the Alpaca dataset using the following command:"

#: ../../../usage.md:66 60283b9237c8462ea37288b8ece79081
#: ../../../usage.md:66
msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize"
msgstr ""
"It is recommended that users refer to alpaca_tokenizer.py to write new "
"scripts to tokenize their own datasets"

#: ../../../usage.md:68 cdf45a4de9874e9fb65f7104dcee3c61
#: ../../../usage.md:68
msgid "训练配置"
msgstr "Training Configuration"

#: ../../../usage.md:70 7c42ebc23246450cbc1270e1461b16f6
msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。"
#: ../../../usage.md:70
#, fuzzy
msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例:"
msgstr ""
"Taking the configuration file `configs/7B_sft.py` for the 7B demo as an "
"example, let's discuss the data, model, and parallel configurations "
"example,"

#: ../../../usage.md:237
msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。"
msgstr "let's discuss the data, model, parallel and monitoring configurations "
"required to start a model training."

#: ../../../usage.md:72 247cfe98a7f44c2293aa2e2351f1ea69
#: ../../../usage.md:239
msgid "数据配置"
msgstr "Data Configuration"

#: ../../../usage.md:73 31327e7dce5848778db5361b3fbded1c
#: ../../../usage.md:240
msgid "数据相关的关键参数配置及释义如下所示:"
msgstr "Here are the key parameters and their explanations for data configuration:"

#: ../../../usage.md:88 4d2608136fef4141bd6e47f78b8591b2
#: ../../../usage.md:255
msgid ""
msgstr ""

#: ../../../usage.md:88 c5acb028f2694712b2af788a864d5927
#: ../../../usage.md:255
msgid "pack_into_one"
msgstr ""

#: ../../../usage.md:91 db6b9ce8e8294952845893dd7aad098f
#: ../../../usage.md:258
msgid "目前支持传入数据集文件路径`train_folder`,且要求文件格式如下:"
msgstr ""
"Currently, it supports passing the dataset file path `train_folder`, and "
"the file format is required to be as follows:"

#: ../../../usage.md:98 f22536fc3dfa4552a103a7cb57a20f92
#: ../../../usage.md:265
msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
msgstr ""
"For detailed information about the dataset, please refer to the \"Data "
"Preparation\" section."

#: ../../../usage.md:100 bc4f0b06e9c24730a7a831b7aca417e2
#: ../../../usage.md:267
msgid "模型配置"
msgstr "Model Configuration"

#: ../../../usage.md:102 ecf278a0a851496fae2e49c436e59368
#: ../../../usage.md:269
msgid "如果在启动训练时要加载模型 `checkpoint`,可进行如下相关配置:"
msgstr ""
"If you want to load a model checkpoint when starting the training, you "
"can configure it as follows:"

#: ../../../usage.md:115 38244aba74294067a4019d0777621746
#: ../../../usage.md:282
msgid "注意:"
msgstr "Note:"

#: ../../../usage.md:116 19d1eb0a797f4bd9a702a00e525d7753
msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
msgstr ""
"`load_model_only_folder` and `load_ckpt_folder` cannot be set at the same"
" time."

#: ../../../usage.md:117 3ea27a1f6be044a3959890be69311b24
#: ../../../usage.md:283
msgid "路径若以 `local:` 为前缀,则存储在本地文件系统;若以 `boto3:` 为前缀,则存储在远程 oss 上"
msgstr ""
"If the path starts with `local:`, it means the file is stored in the "
"local file system. If it starts with `boto3:`, it means the file is "
"stored in the remote OSS."

#: ../../../usage.md:119 1d6381b4cfff41d8bdd5347e8a135869
#: ../../../usage.md:285
msgid "模型相关关键参数配置如下所示:"
msgstr "The configuration for the model is as follows:"

#: ../../../usage.md:143 1026791c9f054576857ef1930db6b167
#: ../../../usage.md:309
msgid "注意:用户可自定义模型类型名和模型结构,并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册,在训练主函数`train.py`中初始化模型时,可通过`model_type`配置获取指定的模型初始化接口函数。"
msgstr ""
"Note: Users can customize the model type name and model structure, and "

@@ -251,7 +250,7 @@ msgstr ""
"interface function can be obtained through the `model_type` "
"configuration."
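The registration mechanism mentioned in the note above follows a registry pattern. The decorator and lookup calls below are a schematic guess at the `MODEL_INITIALIZER` interface, not a verbatim excerpt from `utils/registry.py`.

```python
# Schematic sketch of the registry pattern described above; the exact method names
# on MODEL_INITIALIZER are assumptions.
from internlm.utils.registry import MODEL_INITIALIZER


@MODEL_INITIALIZER.register_module("MY_LLM")  # hypothetical model_type name
def build_my_llm(num_layers, hidden_size, **kwargs):
    """Build and return a model instance for the custom architecture."""
    ...


# In train.py-style code the initializer would then be resolved from the config:
#   builder = MODEL_INITIALIZER.get_module(model_type)
#   model = builder(**model_cfg)
```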
#: ../../../usage.md:145 34823bcbe7754190bc9747758c1aad0c
#: ../../../usage.md:311
msgid ""
"*如果基于 InternLM 7B继续训练,可以参考 "
"[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "

@@ -261,21 +260,21 @@ msgstr ""
"OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
"zoo) to download weights*."

#: ../../../usage.md:147 4cabc928f8884cd38a6bb683b3bfade3
#: ../../../usage.md:313
msgid "并行配置"
msgstr "Parallel Configuration"

#: ../../../usage.md:149 f97ade07340340959345e73567bae793
#: ../../../usage.md:315
msgid "训练并行配置样例如下:"
msgstr "Training parallel configuration example:"

#: ../../../usage.md:158 87fb5a4e4a4047ee8a9b8bb43915636d
#: ../../../usage.md:324
msgid "zero1:zero 并行策略,分如下三种情况,默认值为 -1"
msgstr ""
"zero1: zero parallel strategy, divided into the following three cases, "
"default value is -1"

#: ../../../usage.md:159 58dc08e2c52e4aaba99b4fbb6cf2e8b4
#: ../../../usage.md:325
#, fuzzy
msgid "当`zero1 <= 0`,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配"
msgstr ""

@@ -283,57 +282,57 @@ msgstr ""
"size of the data parallel process group, so the optimizer state "
"parameters will be split within the data parallel range."

#: ../../../usage.md:160 67e2ebd795d840b29fd1d684a068e90d
#: ../../../usage.md:326
#, fuzzy
msgid "当`zero1 == 1`,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数"
msgstr ""
"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain "
"the complete optimizer state parameters."
"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain"
" the complete optimizer state parameters."

#: ../../../usage.md:161 7caedfc943514b9b83090b858ef6d163
#: ../../../usage.md:327
#, fuzzy
msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`,则 zero1 进程组是数据并行进程组的子集"
msgstr ""
"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process"
" group is a subset of the data parallel process group."
"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 "
"process group is a subset of the data parallel process group."

#: ../../../usage.md:162 b38d3a1f72d543c6a44728fb6babea6b
#: ../../../usage.md:328
msgid "tensor:张量并行大小,通常是每个节点的 GPU 数量,默认值为 1"
msgstr ""
"tensor: tensor parallel size, usually the number of GPUs per node, "
"default is 1"

#: ../../../usage.md:163 237ac76df68f4a999396dad37c5495c3
#: ../../../usage.md:329
msgid "pipeline:流水线并行策略"
msgstr "pipeline: pipeline parallel strategy"

#: ../../../usage.md:164 c8c38f6ab2ea432eb9ebbb62618ca33e
#: ../../../usage.md:330
msgid "size:流水线并行大小,默认值为 1"
msgstr "size: pipeline parallel size, the default value is 1"

#: ../../../usage.md:165 b9158818e72e49acbdd52ad317cb80df
#: ../../../usage.md:331
msgid "interleaved_overlap:bool 类型,交错式调度时,开启或关闭通信优化,默认值为关闭"
msgstr ""
"interleaved_overlap: bool type, when interleaved scheduling, enable or "
"disable communication optimization, the default value is False"

#: ../../../usage.md:166 28e4d48661ff4f80aff788fdda604433
#: ../../../usage.md:332
msgid "sequence_parallel:是否开启序列化并行,默认值为 False"
msgstr ""
"sequence_parallel: Whether to enable sequence parallelism, the default "
"value is False"

#: ../../../usage.md:168 27528ab826824d2280506460e1f2f7bd
#: ../../../usage.md:334
msgid "注意:`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
msgstr ""
"Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
" / Tensor parallel size`"
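A worked example of the note above, with illustrative numbers only:

```python
# Illustrative numbers; nothing here is prescribed by the documentation.
total_gpus = 16      # e.g. 2 nodes x 8 GPUs
pipeline_size = 1
tensor_size = 1

# Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size
data_parallel_size = total_gpus // pipeline_size // tensor_size   # 16

# With zero1=8, optimizer states are partitioned across the 8 ranks of each zero1
# group, and the 16 data-parallel ranks form 2 such groups (zero1 <= dp world size).
zero1 = 8
num_zero1_groups = data_parallel_size // zero1                    # 2
print(data_parallel_size, num_zero1_groups)
```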
#: ../../../usage.md:170 5a7af23cec604f1d9096a5ab81993c87
#: ../../../usage.md:336
msgid "启动训练"
msgstr "Start Training"

#: ../../../usage.md:172 795e51542ed84cea83b63c5233bb88bc
#: ../../../usage.md:338
msgid "完成了以上数据集准备和相关训练配置后,可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例,介绍训练启动方式。"
msgstr ""
"After completing the data preparation and relevant training "

@@ -341,25 +340,30 @@ msgstr ""
"following examples demonstrate how to start the training in both slurm "
"and torch environments."

#: ../../../usage.md:174 96402cbe443044c0a0a1695c9847140b
#: ../../../usage.md:340
msgid "若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:"
msgstr ""
"If you want to start distributed training on slurm with 16 GPUs across "
"multiple nodes, use the following command:"

#: ../../../usage.md:179 c569e60401a6471eb9af2473acc4d5a6
#: ../../../usage.md:345
msgid "若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:"
msgstr ""
"If you want to start distributed training on torch with 8 GPUs on a "
"single node, use the following command:"

#: ../../../usage.md:184 a045a060d0734aab9d894aed553cef34
#: ../../../usage.md:350
msgid "运行结果"
msgstr "Training Results"

#: ../../../usage.md:186 c68e8dfa259647c7a6e6e0c0446b0b18
#: ../../../usage.md:352
msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例,训练结果日志展示如下:"
msgstr ""
"Taking the configuration of the demo training on a single machine with 8 "
"GPUs on slurm as an example, the training result log is shown below:"

#~ msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
#~ msgstr ""
#~ "`load_model_only_folder` and `load_ckpt_folder` "
#~ "cannot be set at the same time."

doc/en/usage.md

@@ -74,7 +74,173 @@ It is recommended that users refer to alpaca_tokenizer.py to write new scripts t

### Training Configuration

Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, and parallel configurations required to start a model training.
Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, parallel and monitoring configurations required to start a model training.
```python
JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False, # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicate ckpt path,
    # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, means disable evaluate
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=50000,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with less than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimzer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0, # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
        so parameters will be divided within the range of dp.
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None, # feishu webhook to send alert message
        light_monitor_address=None, # light_monitor address to send heartbeat
    ),
)
```
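As a rough sanity check on the `data` settings above, the per-step token budget follows from the relation `packed_length = micro_bsz * SEQ_LEN` stated in the config comments; the data-parallel size below is an assumed example value, not something fixed by the config.

```python
# Illustrative arithmetic; data_parallel_size is an assumed example value.
SEQ_LEN = 2048
micro_bsz = 2
micro_num = 4
data_parallel_size = 8

packed_length = micro_bsz * SEQ_LEN                                # 4096 tokens per micro-batch
tokens_per_step = micro_num * packed_length * data_parallel_size  # 131072 tokens per gradient update
print(packed_length, tokens_per_step)
```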

#### Data Configuration
Here are the key parameters and their explanations for data configuration:

doc/usage.md

@@ -66,7 +66,174 @@ python tools/alpaca_tokenizer.py /path/to/alpaca_dataset /path/to/output_dataset

### 训练配置

以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。
以 7B Demo 的配置文件`configs/7B_sft.py`为例:
```python
JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False, # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicate ckpt path,
    # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, means disable evaluate
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=50000,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with less than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of steps to increase loss scale when no overflow occurs
        growth_interval=1000,
    ),
    # the multiplication factor for increasing loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing loss scale, defaults to 2
    hysteresis=2,
)

hybrid_zero_optimizer = dict(
    # Enable low_level_optimzer overlap_communication
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0, # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
        so parameters will be divided within the range of dp.
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None, # feishu webhook to send alert message
        light_monitor_address=None, # light_monitor address to send heartbeat
    ),
)
```
接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。

#### 数据配置
数据相关的关键参数配置及释义如下所示:

@@ -447,8 +447,8 @@ class CheckpointManager:

    Args:
        ckpt_config (dict): model checkpoint config.
        model (nn.module): model obj
        optimizer (object): optimzier obj.
        model (nn.module): model obj.
        optimizer (object): optimizer obj.
        lr_scheduler (object): lr_scheduler obj.
        model_config (dict): model config.
    """

@@ -712,7 +712,6 @@ now step_count is {train_state.step_count}",
        return dict(path=latest_ckpt, content=("all",), ckpt_type="internlm")

    def try_resume_training(self, train_state: TrainState, current_time=""):

        if self.load_ckpt_info is None or self.load_ckpt_info["path"] is None:
            if gpc.is_rank_for_log():
                logger.info(