diff --git a/README-ja-JP.md b/README-ja-JP.md index 1cfce1f..395e248 100644 --- a/README-ja-JP.md +++ b/README-ja-JP.md @@ -16,6 +16,7 @@ [![license](./doc/imgs/license.svg)](./LICENSE) [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/) +[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest) [📘使用法](./doc/en/usage.md) | [🛠️インストール](./doc/en/install.md) | diff --git a/README-zh-Hans.md b/README-zh-Hans.md index 5979c7f..6679939 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -16,6 +16,7 @@ [![license](./doc/imgs/license.svg)](https://github.com/open-mmlab/mmdetection/blob/main/LICENSE) [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/) +[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest) [📘使用文档](./doc/usage.md) | [🛠️安装教程](./doc/install.md) | diff --git a/README.md b/README.md index d8711f2..0097aa8 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ [![license](./doc/imgs/license.svg)](./LICENSE) [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/) +[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest) [📘Usage](./doc/en/usage.md) | [🛠️Installation](./doc/en/install.md) | diff --git a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po index f1d7a41..bd81fa5 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-07 10:56+0800\n" +"POT-Creation-Date: 2023-09-13 17:07+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -19,30 +19,33 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../source/checkpoint.rst:2 09c8645fba264cdf9a80c4b62c2bb4d1 +#: ../../source/checkpoint.rst:2 msgid "模型保存" msgstr "Model Checkpointing" -#: ../../source/checkpoint.rst:4 8b158d34631045b1afdb4fb0169b3c71 +#: ../../source/checkpoint.rst:4 msgid "" "InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` " -"来管理模型保存。 其中,可以 使用 ``CheckpointManager.try_save_checkpoint(train_state)`` " -"来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。" +"来管理模型保存。其中,可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` " +"来保存指定 step 的模型状态。" msgstr "" -"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation, " -"we use ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports " -"automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit." +"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to " +"manage model checkpointing. In the implementation, we use " +"``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint " +"training states at specific steps. 
" -#: ../../source/checkpoint.rst:8 a023b5a6d15749bfaa51cf2da194bda1 +#: ../../source/checkpoint.rst:6 +msgid "InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。" +msgstr "InternLM supports automatic loading of latest ckpt at startup and automatic model checkpointing at signal quit. " + +#: ../../source/checkpoint.rst:9 msgid "Checkpointing" msgstr "" -#: 938575c699d1426c87e0b3f589a85d50 #: internlm.utils.model_checkpoint.CheckpointManager:1 of msgid "StorageManagerContext" msgstr "" -#: 754d6881cd034c5ebaab0f3362dd14c2 #: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of msgid "" "Exit signal detection function, if we write the exit step in the " @@ -51,34 +54,27 @@ msgid "" "quit." msgstr "" -#: 2169f9fb4a8b40bc9bf6093894fc7a5e 6a55d2b2b24a44c8b78b40f19f4d950b -#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training of +#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of msgid "参数" msgstr "" -#: 360a89b1591e4627ac432f4d75050354 #: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of msgid "返回" msgstr "" -#: 2426832f4a8a4c5481be1c940e0e7b50 #: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of msgid "whether to quit." msgstr "" -#: 5f6842c261544a3c89f32d981b3ad755 #: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of msgid "返回类型" msgstr "" -#: 1392da84b6e645bcb8dab605e1231fdc #: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1 #: of msgid "wait for all checkpoint uploads to be completed" msgstr "" -#: d1774593e9c94608b49b10504bfbc38b #: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1 #: of msgid "" @@ -86,38 +82,25 @@ msgid "" "found, None will return." msgstr "" -#: a3abbbd2bd574872892d908ab248e804 -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:1 of -msgid "Attempt to restore the training state of the last ckpt." -msgstr "" - -#: de021d1eb6d54955a2850c11c0191710 -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:3 of -msgid "lr_scheduler object." -msgstr "" - -#: 20be15854f2e420a9d96c86b5869bfa6 -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:5 of -msgid "optimizer object." -msgstr "" - -#: 68f69086c5054acc8aca15c8a764acc5 -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:7 of -msgid "learning rate." -msgstr "" - -#: 5d34d34a972d4abeab4bda3e49ee157b -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:9 of -msgid "traing states." -msgstr "" - -#: 82ebb67afaa748ecabc4cef598d7fc30 -#: internlm.utils.model_checkpoint.CheckpointManager.try_resume_training:11 of -msgid "traning dataloader object" -msgstr "" - -#: 0c95dfcd712749279daca78166bb4326 #: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of msgid "Save checkpoint to the given folder path." msgstr "" +#~ msgid "Attempt to restore the training state of the last ckpt." +#~ msgstr "" + +#~ msgid "lr_scheduler object." +#~ msgstr "" + +#~ msgid "optimizer object." +#~ msgstr "" + +#~ msgid "learning rate." +#~ msgstr "" + +#~ msgid "traing states." 
+#~ msgstr "" + +#~ msgid "traning dataloader object" +#~ msgstr "" + diff --git a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po index 345e06b..67f2451 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po @@ -37,8 +37,8 @@ msgstr "Start Training" #: ../../source/example/30B_demo.rst:166 24974384d5ab42e68266aeb67ae222ce msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动两节点 16GPU 的训练命令如下所示:" -msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. -The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs." +msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. " +"The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs." #: ../../source/example/30B_demo.rst:173 948ac71ed53848f9bad07f69d956c4bb msgid "训练结果" diff --git a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po index 904cd71..ccc6bca 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po @@ -37,8 +37,8 @@ msgstr "Start Training" #: ../../source/example/7B_demo.rst:164 9e7a864ae2e14d05b0681f16792e5278 msgid "完成以上训练配置后,可启动模型训练,以在 ``slurm`` 平台上为例,启动单节点 8GPU 的训练命令如下所示:" -msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. -The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs." +msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. " +"The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs." #: ../../source/example/7B_demo.rst:171 fdd053efb1854d46aabf6c0f279fe7fc msgid "训练结果" diff --git a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po index c3ea055..3303581 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-08 15:32+0800\n" +"POT-Creation-Date: 2023-09-14 12:23+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -23,24 +23,68 @@ msgstr "" msgid "训练构建" msgstr "Training Setup" -#: ../../source/initialize.rst:7 +#: ../../source/initialize.rst:4 +msgid "InternLM 的训练流程可以归纳为两个步骤:" +msgstr "The training process of InternLM can be summarized into two steps: " + +#: ../../source/initialize.rst:6 +msgid "初始化" +msgstr "Initialization" + +#: ../../source/initialize.rst:8 +msgid "初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。" +msgstr "" +"Initialize model, optimizer, dataloader, trainer, and create different " +"types of process groups to prepare for iterative steps of hybrid parallel training. " + +#: ../../source/initialize.rst:9 +msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。" +msgstr "" +"Initialize logger, checkpoint manager, monitor manager, and profiler to " +"watch, alert, and record the iterative training steps. 
" + +#: ../../source/initialize.rst:11 +msgid "迭代训练" +msgstr "Iterative training steps" + +#: ../../source/initialize.rst:13 +msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。" +msgstr "" +"Load the training engine and scheduler for hybrid parallel training " +"according to the configuration such as tensor parallel size, pipeline " +"parallel size, and data parallel size. " + +#: ../../source/initialize.rst:14 +msgid "在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。" +msgstr "" +"In iterative training steps, the Trainer API is called to perform zero " +"gradients, forward-loss-backward, and parameter update." + +#: ../../source/initialize.rst:20 +msgid "InternLM训练流程图" +msgstr "InternLM training process" + +#: ../../source/initialize.rst:25 msgid "命令行参数解析" msgstr "Argument Parsing" -#: ../../source/initialize.rst:9 -#, fuzzy +#: ../../source/initialize.rst:27 msgid "" "InternLM 使用 `argparse `_" -" 库来向InternLM运行时提供命令行参数配置。用户可使用 " -"``internlm.initialize.get_default_parser()`` 来获取 InternLM " -"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。" +" 库来向InternLM运行时提供命令行参数配置。" msgstr "" "InternLM uses the `argparse " "`_ library to supply " -"commandline configuration to the InternLM runtime. Use " -"``internlm.initialize.get_default_parser()`` to get InternLM's default " -"parser with some builtin arguments, users can add custom parameters to " -"this parser." +"commandline configuration to the InternLM runtime. " + +#: ../../source/initialize.rst:29 +msgid "" +"用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM " +"的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。" +msgstr "" +"Use ``internlm.initialize.get_default_parser()`` to get InternLM's " +"default parser with some builtin arguments, users can add custom " +"parameters to this parser." #: internlm.initialize.launch.get_default_parser:1 of msgid "" @@ -69,7 +113,7 @@ msgstr "" msgid "返回类型" msgstr "" -#: ../../source/initialize.rst:25 +#: ../../source/initialize.rst:45 msgid "模型初始化" msgstr "Model Initialization" @@ -81,26 +125,26 @@ msgstr "" msgid "The neural network model to be trained or evaluated." msgstr "" -#: ../../source/initialize.rst:29 +#: ../../source/initialize.rst:49 msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下:" msgstr "" "InternLM uses the field ``model_type`` and ``model`` in the config file " "to control model initialization process. An example model initialization " "configuratio" -#: ../../source/initialize.rst:57 +#: ../../source/initialize.rst:77 msgid "字段 ``model_type`` 指明了要初始化的模型类型" msgstr "" "The field ``model_type`` specifics the model type has been registered and" " to be initialized." -#: ../../source/initialize.rst:58 +#: ../../source/initialize.rst:78 msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置" msgstr "" "The parameters in field ``model`` specific the configuration settings " "during model initialization." -#: ../../source/initialize.rst:60 +#: ../../source/initialize.rst:80 msgid "" "值得注意的是,用户可以定义新的模型类型,并使用装饰器 ``@MODEL_INITIALIZER.register_module`` " "注册模型的初始化函数,其中 ``MODEL_INITIALIZER`` 是类 " @@ -112,7 +156,7 @@ msgstr "" " instantiated object of class ``internlm.util.registry.Registry``, the " "example is shown as follows." -#: ../../source/initialize.rst:72 +#: ../../source/initialize.rst:92 msgid "优化器初始化" msgstr "Optimizer Initialization" @@ -134,7 +178,7 @@ msgstr "" msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)." 
msgstr "" -#: ../../source/initialize.rst:79 +#: ../../source/initialize.rst:99 msgid "数据加载器初始化" msgstr "Dataloader Initialization" @@ -162,7 +206,7 @@ msgstr "" msgid "A tuple of (train_dl, dataset_types)." msgstr "" -#: ../../source/initialize.rst:86 +#: ../../source/initialize.rst:106 msgid "Trainer 初始化" msgstr "Trainer Initialization" diff --git a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po index 71adf14..38858cd 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-08 15:32+0800\n" +"POT-Creation-Date: 2023-09-14 11:05+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -32,13 +32,13 @@ msgid "" "InternLM 使用 ``internlm.train.initialize_llm_profile()`` " "来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler " "`_ ,输出的性能分析 trace 文件可以使用 " -"`tensorboard `_ 进行可视化。" +"`tensorboard `_ 进行可视化。" msgstr "" "InternLM uses ``internlm.train.initialize_llm_profile()`` to profile " "performance data, execution time duration and breakdown analysis of step " "time. The implementation is based on `torch.profiler " "`_ and output tracing " -"files can be visualized with `tensorboard `_." +"files can be visualized with `tensorboard `_." #: ../../source/profiler.rst:11 msgid "" @@ -53,11 +53,15 @@ msgstr "" #: ../../source/profiler.rst:13 msgid "实际运行生成的 ``Torch Profiler`` 目录结构如下:" -msgstr "The directory structure of ``Torch Profiler`` generated files is as follows:" +msgstr "" +"The directory structure of ``Torch Profiler`` generated files is as " +"follows:" #: ../../source/profiler.rst:22 msgid "其中, ``traces`` 可以通过 ``TensorBoard`` 可视化,运行命令" -msgstr "Among them, ``traces`` can be visualized through ``TensorBoard`` and run with the command" +msgstr "" +"Among them, ``traces`` can be visualized through ``TensorBoard`` and run " +"with the command" #: ../../source/profiler.rst:29 msgid "" @@ -66,7 +70,12 @@ msgid "" "tensorboard " "`_" -msgstr "In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page, you can see the timeline of profiled operators and GPU kernels. For more usage, please refer to `torch profiler with tensorboard `_" +msgstr "" +"In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page," +" you can see the timeline of profiled operators and GPU kernels. For more" +" usage, please refer to `torch profiler with tensorboard " +"`_" #: internlm.train.training_internlm.initialize_llm_profile:1 of msgid "Initialize and return the profiler context manager instance." 
diff --git a/doc/code-docs/locales/en/LC_MESSAGES/training.po b/doc/code-docs/locales/en/LC_MESSAGES/training.po index c9d9521..05c834f 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/training.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/training.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-07 10:56+0800\n" +"POT-Creation-Date: 2023-09-14 12:23+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -19,109 +19,144 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../source/training.rst:2 6eafa5eb08e040039309a39cdb0f1bfe +#: ../../source/training.rst:2 msgid "训练 API" msgstr "Training API" -#: ../../source/training.rst:4 74d81f3d0ca54c839d4e80bd589aedb2 +#: ../../source/training.rst:4 msgid "" "InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` " "管理。在定义了训练引擎和调度器之后,我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。" msgstr "" -"InternLM training API is managed in ``internlm.core.trainer.Trainer``. After defining the " -"training engine and runtime scheduler, we can call training API to perform training, evaluation, " -"zero gradients and parameter update steps." +"InternLM training API is managed in ``internlm.core.trainer.Trainer``. " +"After defining the training engine and runtime scheduler, we can call " +"training API to perform training, evaluation, zero gradients and " +"parameter update steps." -#: ../../source/training.rst:6 0e0cfddbb2334d3da99d3289edf4161d +#: ../../source/training.rst:6 msgid "有关详细用法,请参阅 Trainer API 文档和示例。" -msgstr "For detailed usage, please refer to Trainer API documentation and examples." +msgstr "" +"For detailed usage, please refer to Trainer API documentation and " +"examples." -#: 7ea10280a8f1489984cb9994aa08976b internlm.core.trainer.Trainer:1 of +#: internlm.core.trainer.Trainer:1 of msgid "" "This is a class tending for easy deployments of users' training and " "evaluation instead of writing their own scripts." msgstr "" -#: 7969dca55840451193bffd3b071ab3b3 aff576168b59460491bb5da0ce41ea74 #: internlm.core.trainer.Trainer internlm.core.trainer.Trainer.execute_schedule #: of msgid "参数" msgstr "" -#: 59754d3e9ee8452a872bf397c01e0d8c internlm.core.trainer.Trainer:4 of +#: internlm.core.trainer.Trainer:4 of msgid "Engine responsible for the process function." msgstr "" -#: 2d18ff15256e48f98901c7a7e0cbbe35 internlm.core.trainer.Trainer:6 of +#: internlm.core.trainer.Trainer:6 of msgid "Runtime schedule. Defaults to None." msgstr "" -#: 76f4b3c7feba40eca3ee2b32559c53f5 internlm.core.trainer.Trainer.engine:1 of +#: internlm.core.trainer.Trainer.engine:1 of msgid "" "Returns the engine that responsible for managing the training and " "evaluation process." msgstr "" -#: c7eae2d4d06c4ef891e314902d80b7f3 internlm.core.trainer.Trainer.schedule:1 of +#: internlm.core.trainer.Trainer.schedule:1 of msgid "Returns the runtime scheduler." msgstr "" -#: cb495b21b3444881aec83803e92386d9 #: internlm.core.trainer.Trainer.uses_pipeline:1 of msgid "Returns whether the pipeline parallel is used or not." msgstr "" -#: 86b0b631189e46468281a397c5e97350 internlm.core.trainer.Trainer.train:1 of +#: internlm.core.trainer.Trainer.train:1 of msgid "Sets the model to training mode." msgstr "" -#: f997e13120ee4d8b9e45ea6698b3e2a6 internlm.core.trainer.Trainer.eval:1 of +#: internlm.core.trainer.Trainer.eval:1 of msgid "Sets the model to evaluation mode." 
msgstr "" -#: a8179e50312d47dcbe9de0433a65c2f7 internlm.core.trainer.Trainer.zero_grad:1 -#: of +#: internlm.core.trainer.Trainer.zero_grad:1 of msgid "Sets the gradient of all parameters in the model to zero." msgstr "" -#: f936136ef9e0452ca439b7c66dc8884b internlm.core.trainer.Trainer.step:1 of +#: internlm.core.trainer.Trainer.step:1 of msgid "Executes the parameter update step." msgstr "" -#: 250e2af89cfd432c84d228f9e03c174c #: internlm.core.trainer.Trainer.execute_schedule:1 of msgid "" "Runs the forward, loss computation, and backward for the model. Returns a" " tuple of (output, label, loss)." msgstr "" -#: 6ca7de83033b432792eb0d7935ea04da #: internlm.core.trainer.Trainer.execute_schedule:4 of msgid "The data iterator." msgstr "" -#: 6d3044e75b3149beba3c659e15607b79 #: internlm.core.trainer.Trainer.execute_schedule:6 of msgid "Additional keyword arguments." msgstr "" -#: 99d5a297d6414c30b432acf2566f0d3c #: internlm.core.trainer.Trainer.execute_schedule of msgid "返回" msgstr "" -#: b625ebf0cf874edba384456d33e740b4 #: internlm.core.trainer.Trainer.execute_schedule:8 of msgid "A tuple of (output, label, loss)." msgstr "" -#: 391cde57d2e2478d8f83a7ad270c2a65 #: internlm.core.trainer.Trainer.execute_schedule of msgid "返回类型" msgstr "" -#: d4c4fb0fbddb499786970509cf0c9e13 #: internlm.core.trainer.Trainer.execute_schedule:9 of msgid "Tuple[:class:`torch.Tensor`]" msgstr "" +#~ msgid "InternLM 的训练流程可以归纳为两个步骤:" +#~ msgstr "The training process of InternLM can be summarized into two steps: " + +#~ msgid "初始化" +#~ msgstr "Initialization" + +#~ msgid "初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。" +#~ msgstr "" +#~ "Initialize model, optimizer, dataloader, " +#~ "trainer, and create different types of" +#~ " process groups to prepare for " +#~ "iterative steps of hybrid parallel " +#~ "training. " + +#~ msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。" +#~ msgstr "" +#~ "Initialize logger, checkpoint manager, monitor" +#~ " manager, and profiler to watch, " +#~ "alert, and record the iterative training" +#~ " steps. " + +#~ msgid "迭代训练" +#~ msgstr "Iterative training steps" + +#~ msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。" +#~ msgstr "" +#~ "Load the training engine and scheduler" +#~ " for hybrid parallel training according " +#~ "to the configuration such as tensor " +#~ "parallel size, pipeline parallel size, " +#~ "and data parallel size. " + +#~ msgid "在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。" +#~ msgstr "" +#~ "In iterative training steps, the Trainer" +#~ " API is called to perform zero " +#~ "gradients, forward-loss-backward, and " +#~ "parameter update." 
+ +#~ msgid "InternLM训练流程图" +#~ msgstr "InternLM training process" + diff --git a/doc/code-docs/locales/en/LC_MESSAGES/usage.po b/doc/code-docs/locales/en/LC_MESSAGES/usage.po index 2297f8f..37e7cba 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-07 14:15+0800\n" +"POT-Creation-Date: 2023-09-11 14:25+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -19,11 +19,11 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" "Generated-By: Babel 2.12.1\n" -#: ../../../usage.md:2 a64aaaa1525e4e01b0ddcebc42c24bbd +#: ../../../usage.md:2 msgid "使用教程" msgstr "Quickstart Guide" -#: ../../../usage.md:4 f1b40737fb584d889b82c7f55b652977 +#: ../../../usage.md:4 msgid "" "启动一个 Demo " "模型训练,需要进行三项准备,**安装**,**数据集准备**和**模型训练配置**。接下来,首先会介绍数据准备相关的操作,再简要描述模型训练配置相关的内容。" @@ -33,21 +33,21 @@ msgstr "" "configuration**. In this guide, we will first cover the steps for dataset" " preparation and then briefly describe the model training configuration." -#: ../../../usage.md:6 b35abe307c2f4d23866fff828308ebf2 +#: ../../../usage.md:6 msgid "安装" msgstr "Installation" -#: ../../../usage.md:7 64a8c1f5f71c45519e636aa7edba10bc +#: ../../../usage.md:7 msgid "请参考[安装文档](./install.md)进行安装。" msgstr "" "Please refer to the [installation guide](./install.md) for instructions " "on how to install the necessary dependencies." -#: ../../../usage.md:9 bd96714d12ee415794dea5a4578bd8cd +#: ../../../usage.md:9 msgid "数据准备 (预训练)" msgstr "Dataset Preparation (Pre-training)" -#: ../../../usage.md:11 5a0b39fb9da94e96b87db40d1f231a0c +#: ../../../usage.md:11 msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型,可直接修改`tokernizer.py`中的模型参数路径。" msgstr "" "The dataset for the InternLM training task includes a series of `bin` and" @@ -58,7 +58,7 @@ msgstr "" "different model, you can directly modify the model parameter path in " "`tokenizer.py`." -#: ../../../usage.md:13 3cef8126b8784af48d81cc140322909e +#: ../../../usage.md:13 msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件,其中参数`text_input_path`表示原始文本数据路径,目前支持`txt`、`json`和`jsonl`三种输入格式,`bin_output_path`表示生成的`bin`文件的保存路径。" msgstr "" "You can run the following command to generate `bin` and `meta` files " @@ -67,30 +67,30 @@ msgstr "" "`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents " "the save path of the generated `bin` files." 
-#: ../../../usage.md:18 107ff2280da14cb6a27f4e9857186333 +#: ../../../usage.md:18 msgid "下面是一个数据处理的例子:" msgstr "Here is an example of data processing:" -#: ../../../usage.md:20 c11a9860263c4e2288a561f3435fa706 +#: ../../../usage.md:20 msgid "给定一个包含原始数据集的文件`raw_data.txt`,原始数据集如下所示:" msgstr "" "Given a file `raw_data.txt` containing the raw dataset, the raw dataset " "is shown below:" -#: ../../../usage.md:27 4012599b42ab47bd979d2a0b79ca1147 +#: ../../../usage.md:27 msgid "可以通过运行以下命令来生成`bin`和`meta`文件:" msgstr "" "You can generate the `bin` and `meta` files by running the following " "command:" -#: ../../../usage.md:32 cca91b6cf53a4082932dd34ea4b7f954 +#: ../../../usage.md:32 msgid "需要注意的是,生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下,以区分数据集的类型。" msgstr "" "It should be noted that the generated `bin` files need to be saved in one" " of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or " "`kaoshi`, depending on the type of dataset." -#: ../../../usage.md:34 417312ca1e35479e811953f777e3565a +#: ../../../usage.md:34 msgid "其中,`cn`表示中文数据集;`en`表示英文数据集;`code`表示代码数据集;`ja`表示日语数据集;`ar`表示阿拉伯语数据集;`kaoshi`表示考试数据集。" msgstr "" "Here, `cn` represents the Chinese dataset, `en` represents the English " @@ -98,22 +98,22 @@ msgstr "" " dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the" " exam dataset." -#: ../../../usage.md:36 79c21f8e89b34499ba4e25e20593ec28 +#: ../../../usage.md:36 msgid "生成的bin文件的格式如下:" msgstr "The format of the generated `bin` files is as follows:" -#: ../../../usage.md:42 26388d996c4e4116bc216be9bc007f62 +#: ../../../usage.md:42 msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子,表示每个句子的`token`(下文将用sequence指定)。" msgstr "" "Each line in the `bin` file corresponds to each sentence in the original " "dataset, representing the tokens of each sentence (referred to as " "sequence below)." -#: ../../../usage.md:44 b39148a85ee64a349975d26282fbe59b +#: ../../../usage.md:44 msgid "生成的`meta`文件的格式如下:" msgstr "The format of the generated `meta` file is as follows:" -#: ../../../usage.md:48 175a6007197a40568535f945672e5df2 +#: ../../../usage.md:48 msgid "" "在`meta`文件中,每个元组对应着`bin`文件中每一个`sequence`的元信息。其中,元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting" " index`,第二个元素表示每个`sequence`中有多少个`tokens`。" @@ -123,7 +123,7 @@ msgstr "" "index` of each `sequence` among all `sequences`, and the second element " "indicates the number of `tokens` for each `sequence`." -#: ../../../usage.md:50 46874a3de3924837979f9949f1237e39 +#: ../../../usage.md:50 msgid "" "例如,对于第一个`sequence`,`starting index`为 0,有 11 " "个`tokens`;对于第二个`sequence`,由于第一个`sequence`转换为`string`后的长度为`89`,因此它的`starting" @@ -132,17 +132,17 @@ msgstr "" "For example, the first `sequence` starts at index 0 and has 16 `tokens`. " "The second `sequence` starts at index 110 and has 24 `tokens`." -#: ../../../usage.md:52 25ea049fa411408b8856e7aa657835ab +#: ../../../usage.md:52 msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致,此处不再赘叙。" msgstr "" "The `bin` and `meta` file formats for `json` and `jsonl` type files are " "the same as for `txt`, so we won't go over them here." -#: ../../../usage.md:54 bc52f959cb57494483a181e843014ed1 +#: ../../../usage.md:54 msgid "数据准备 (微调)" msgstr "Data Preparation (Fine-tuning)" -#: ../../../usage.md:56 73c74620c2994486acc747ba0c7f0b46 +#: ../../../usage.md:56 msgid "" "微调任务的数据集格式与预训练任务保持一致,生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca " "数据集为例,介绍微调的数据准备流程。" @@ -152,7 +152,7 @@ msgstr "" "the Alpaca dataset as an example to explain the data preparation process " "for fine-tuning." 
-#: ../../../usage.md:58 75f0e22d10ca413389ec8b947ae6141f +#: ../../../usage.md:58 msgid "" "下载 [Alpaca 数据集](https://github.com/tatsu-" "lab/stanford_alpaca/blob/main/alpaca_data.json)" @@ -160,87 +160,87 @@ msgstr "" "Download the [Alpaca dataset](https://github.com/tatsu-" "lab/stanford_alpaca/blob/main/alpaca_data.json)." -#: ../../../usage.md:60 667606fcea454af48353a5b40f82fc46 +#: ../../../usage.md:60 msgid "对 Alpaca 数据进行 tokenize,使用以下命令" msgstr "Tokenize the Alpaca dataset using the following command:" -#: ../../../usage.md:66 60283b9237c8462ea37288b8ece79081 +#: ../../../usage.md:66 msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize" msgstr "" "It is recommended that users refer to alpaca_tokenizer.py to write new " "scripts to tokenize their own datasets" -#: ../../../usage.md:68 cdf45a4de9874e9fb65f7104dcee3c61 +#: ../../../usage.md:68 msgid "训练配置" msgstr "Training Configuration" -#: ../../../usage.md:70 7c42ebc23246450cbc1270e1461b16f6 -msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。" +#: ../../../usage.md:70 +#, fuzzy +msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例:" msgstr "" "Taking the configuration file `configs/7B_sft.py` for the 7B demo as an " -"example, let's discuss the data, model, and parallel configurations " +"example," + +#: ../../../usage.md:237 +msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。" +msgstr "" +"let's discuss the data, model, parallel and monitoring configurations " "required to start a model training." -#: ../../../usage.md:72 247cfe98a7f44c2293aa2e2351f1ea69 +#: ../../../usage.md:239 msgid "数据配置" msgstr "Data Configuration" -#: ../../../usage.md:73 31327e7dce5848778db5361b3fbded1c +#: ../../../usage.md:240 msgid "数据相关的关键参数配置及释义如下所示:" msgstr "Here are the key parameters and their explanations for data configuration:" -#: ../../../usage.md:88 4d2608136fef4141bd6e47f78b8591b2 +#: ../../../usage.md:255 msgid "![pack_into_one](./imgs/pack_into_one.png)" msgstr "" -#: ../../../usage.md:88 c5acb028f2694712b2af788a864d5927 +#: ../../../usage.md:255 msgid "pack_into_one" msgstr "" -#: ../../../usage.md:91 db6b9ce8e8294952845893dd7aad098f +#: ../../../usage.md:258 msgid "目前支持传入数据集文件路径`train_folder`,且要求文件格式如下:" msgstr "" "Currently, it supports passing the dataset file path `train_folder`, and " "the file format is required to be as follows:" -#: ../../../usage.md:98 f22536fc3dfa4552a103a7cb57a20f92 +#: ../../../usage.md:265 msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。" msgstr "" "For detailed information about the dataset, please refer to the \"Data " "Preparation\" section." -#: ../../../usage.md:100 bc4f0b06e9c24730a7a831b7aca417e2 +#: ../../../usage.md:267 msgid "模型配置" msgstr "Model Configuration" -#: ../../../usage.md:102 ecf278a0a851496fae2e49c436e59368 +#: ../../../usage.md:269 msgid "如果在启动训练时要加载模型 `checkpoint`,可进行如下相关配置:" msgstr "" "If you want to load a model checkpoint when starting the training, you " "can configure it as follows:" -#: ../../../usage.md:115 38244aba74294067a4019d0777621746 +#: ../../../usage.md:282 msgid "注意:" msgstr "Note:" -#: ../../../usage.md:116 19d1eb0a797f4bd9a702a00e525d7753 -msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置" -msgstr "" -"`load_model_only_folder` and `load_ckpt_folder` cannot be set at the same" -" time." - -#: ../../../usage.md:117 3ea27a1f6be044a3959890be69311b24 +#: ../../../usage.md:283 msgid "路径若以 `local:` 为前缀,则存储在本地文件系统;若以 `boto3:` 为前缀,则存储在远程 oss 上" msgstr "" "If the path starts with `local:`, it means the file is stored in the " "local file system. 
If it starts with `boto3:`, it means the file is " "stored in the remote OSS." -#: ../../../usage.md:119 1d6381b4cfff41d8bdd5347e8a135869 +#: ../../../usage.md:285 msgid "模型相关关键参数配置如下所示:" msgstr "The configuration for the model is as follows:" -#: ../../../usage.md:143 1026791c9f054576857ef1930db6b167 +#: ../../../usage.md:309 msgid "注意:用户可自定义模型类型名和模型结构,并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册,在训练主函数`train.py`中初始化模型时,可通过`model_type`配置获取指定的模型初始化接口函数。" msgstr "" "Note: Users can customize the model type name and model structure, and " @@ -251,7 +251,7 @@ msgstr "" "interface function can be obtained through the `model_type` " "configuration." -#: ../../../usage.md:145 34823bcbe7754190bc9747758c1aad0c +#: ../../../usage.md:311 msgid "" "*如果基于 InternLM 7B继续训练,可以参考 " "[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 " @@ -261,79 +261,76 @@ msgstr "" "OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-" "zoo) to download weights*." -#: ../../../usage.md:147 4cabc928f8884cd38a6bb683b3bfade3 +#: ../../../usage.md:313 msgid "并行配置" msgstr "Parallel Configuration" -#: ../../../usage.md:149 f97ade07340340959345e73567bae793 +#: ../../../usage.md:315 msgid "训练并行配置样例如下:" msgstr "Training parallel configuration example:" -#: ../../../usage.md:158 87fb5a4e4a4047ee8a9b8bb43915636d +#: ../../../usage.md:324 msgid "zero1:zero 并行策略,分如下三种情况,默认值为 -1" msgstr "" "zero1: zero parallel strategy, divided into the following three cases, " "default value is -1" -#: ../../../usage.md:159 58dc08e2c52e4aaba99b4fbb6cf2e8b4 -#, fuzzy +#: ../../../usage.md:325 msgid "当`zero1 <= 0`,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配" msgstr "" "When `zero1 <= 0`, the size of the zero1 process group is equal to the " "size of the data parallel process group, so the optimizer state " "parameters will be split within the data parallel range." -#: ../../../usage.md:160 67e2ebd795d840b29fd1d684a068e90d -#, fuzzy +#: ../../../usage.md:326 msgid "当`zero1 == 1`,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数" msgstr "" -"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain " -"the complete optimizer state parameters." +"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain" +" the complete optimizer state parameters." -#: ../../../usage.md:161 7caedfc943514b9b83090b858ef6d163 -#, fuzzy +#: ../../../usage.md:327 msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`,则 zero1 进程组是数据并行进程组的子集" msgstr "" -"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process" -" group is a subset of the data parallel process group." +"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 " +"process group is a subset of the data parallel process group." 
-#: ../../../usage.md:162 b38d3a1f72d543c6a44728fb6babea6b +#: ../../../usage.md:328 msgid "tensor:张量并行大小,通常是每个节点的 GPU 数量,默认值为 1" msgstr "" "tensor: tensor parallel size, usually the number of GPUs per node, " "default is 1" -#: ../../../usage.md:163 237ac76df68f4a999396dad37c5495c3 +#: ../../../usage.md:329 msgid "pipeline:流水线并行策略" msgstr "pipeline: pipeline parallel strategy" -#: ../../../usage.md:164 c8c38f6ab2ea432eb9ebbb62618ca33e +#: ../../../usage.md:330 msgid "size:流水线并行大小,默认值为 1" msgstr "size: pipeline parallel size, the default value is 1" -#: ../../../usage.md:165 b9158818e72e49acbdd52ad317cb80df +#: ../../../usage.md:331 msgid "interleaved_overlap:bool 类型,交错式调度时,开启或关闭通信优化,默认值为关闭" msgstr "" "interleaved_overlap: bool type, when interleaved scheduling, enable or " "disable communication optimization, the default value is False" -#: ../../../usage.md:166 28e4d48661ff4f80aff788fdda604433 +#: ../../../usage.md:332 msgid "sequence_parallel:是否开启序列化并行,默认值为 False" msgstr "" "sequence_parallel: Whether to enable sequence parallelism, the default " "value is False" -#: ../../../usage.md:168 27528ab826824d2280506460e1f2f7bd +#: ../../../usage.md:334 msgid "注意:`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`" msgstr "" "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size" " / Tensor parallel size`" -#: ../../../usage.md:170 5a7af23cec604f1d9096a5ab81993c87 +#: ../../../usage.md:336 msgid "启动训练" msgstr "Start Training" -#: ../../../usage.md:172 795e51542ed84cea83b63c5233bb88bc +#: ../../../usage.md:338 msgid "完成了以上数据集准备和相关训练配置后,可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例,介绍训练启动方式。" msgstr "" "After completing the data preparation and relevant training " @@ -341,25 +338,30 @@ msgstr "" "following examples demonstrate how to start the training in both slurm " "and torch environments." -#: ../../../usage.md:174 96402cbe443044c0a0a1695c9847140b +#: ../../../usage.md:340 msgid "若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:" msgstr "" "If you want to start distributed training on slurm with 16 GPUs across " "multiple nodes, use the following command:" -#: ../../../usage.md:179 c569e60401a6471eb9af2473acc4d5a6 +#: ../../../usage.md:345 msgid "若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:" msgstr "" "If you want to start distributed training on torch with 8 GPUs on a " "single node, use the following command:" -#: ../../../usage.md:184 a045a060d0734aab9d894aed553cef34 +#: ../../../usage.md:350 msgid "运行结果" msgstr "Training Results" -#: ../../../usage.md:186 c68e8dfa259647c7a6e6e0c0446b0b18 +#: ../../../usage.md:352 msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例,训练结果日志展示如下:" msgstr "" "Taking the configuration of the demo training on a single machine with 8 " "GPUs on slurm as an example, the training result log is shown below:" +#~ msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置" +#~ msgstr "" +#~ "`load_model_only_folder` and `load_ckpt_folder` " +#~ "cannot be set at the same time." 
+ diff --git a/doc/code-docs/source/checkpoint.rst b/doc/code-docs/source/checkpoint.rst index a192469..ee4f037 100644 --- a/doc/code-docs/source/checkpoint.rst +++ b/doc/code-docs/source/checkpoint.rst @@ -1,8 +1,9 @@ 模型保存 =================== -InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。 其中,可以 -使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。 +InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。其中,可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。 + +InternLM支持启动时自动加载最新的模型备份,并在接收信号退出训练时自动进行模型备份。 Checkpointing ------------- diff --git a/doc/code-docs/source/conf.py b/doc/code-docs/source/conf.py index a41f850..c752047 100644 --- a/doc/code-docs/source/conf.py +++ b/doc/code-docs/source/conf.py @@ -72,14 +72,14 @@ exclude_patterns = [] # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "sphinx_rtd_theme" -html_static_path = ["_static"] +html_static_path = [] # GitHub integration html_context = { "display_github": True, "github_user": "InternLM", "github_repo": "InternLM", - "github_version": "master", + "github_version": "main", "conf_py_path": "/doc/code-docs/source/", } diff --git a/doc/code-docs/source/initialize.rst b/doc/code-docs/source/initialize.rst index 37ee66e..e94f4f6 100644 --- a/doc/code-docs/source/initialize.rst +++ b/doc/code-docs/source/initialize.rst @@ -1,12 +1,32 @@ 训练构建 ============== +InternLM 的训练流程可以归纳为两个步骤: + +1. 初始化 + + * 初始化模型、优化器、数据加载器、Trainer,生成不同种类的进程组,为混合并行的迭代训练做准备。 + * 初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。 + +2. 迭代训练 + + * 根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。 + * 在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。 + +.. figure:: ../../imgs/hybrid_parallel_training.png + :scale: 45% + :class: with-border + + InternLM训练流程图 + .. _InternLM-args: 命令行参数解析 ---------------- -InternLM 使用 `argparse `_ 库来向InternLM运行时提供命令行参数配置。用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。 +InternLM 使用 `argparse `_ 库来向InternLM运行时提供命令行参数配置。 + +用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器,其中包含一些内置参数,用户可以向此解析器添加自定义参数。 .. 
code-block:: python diff --git a/doc/code-docs/source/profiler.rst b/doc/code-docs/source/profiler.rst index 7ff42cb..0163ebe 100644 --- a/doc/code-docs/source/profiler.rst +++ b/doc/code-docs/source/profiler.rst @@ -6,7 +6,7 @@ Torch Profiler ----------------- -InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler `_ ,输出的性能分析 trace 文件可以使用 `tensorboard `_ 进行可视化。 +InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据,如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler `_ ,输出的性能分析 trace 文件可以使用 `tensorboard `_ 进行可视化。 用户如果想使用这个 torch 性能分析工具,需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch 性能分析后,用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` 文件夹中看到性能分析结果。 diff --git a/doc/code-docs/source/qa.rst b/doc/code-docs/source/qa.rst index e1b990a..3912bb3 100644 --- a/doc/code-docs/source/qa.rst +++ b/doc/code-docs/source/qa.rst @@ -1,2 +1,2 @@ 问&答 -==== \ No newline at end of file +===== \ No newline at end of file diff --git a/doc/en/usage.md b/doc/en/usage.md index d115fb1..864ead6 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -74,7 +74,173 @@ It is recommended that users refer to alpaca_tokenizer.py to write new scripts t ### Training Configuration -Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, and parallel configurations required to start a model training. +Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, parallel and monitoring configurations required to start a model training. +```python +JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 2048 +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=50000, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel: + 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. + 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. +tensor parallel: tensor parallel size, usually the number of GPUs per node. 
+""" +parallel = dict( + zero1=8, + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=False, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) +``` #### Data Configuration Here are the key parameters and their explanations for data configuration: diff --git a/doc/imgs/hybrid_parallel_training.png b/doc/imgs/hybrid_parallel_training.png new file mode 100644 index 0000000..33e4ff9 Binary files /dev/null and b/doc/imgs/hybrid_parallel_training.png differ diff --git a/doc/usage.md b/doc/usage.md index 1b98c10..82c20e0 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -66,7 +66,174 @@ python tools/alpaca_tokenizer.py /path/to/alpaca_dataset /path/to/output_dataset ### 训练配置 -以 7B Demo 的配置文件`configs/7B_sft.py`为例,介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。 +以 7B Demo 的配置文件`configs/7B_sft.py`为例: +```python +JOB_NAME = "7b_train" +DO_ALERT = False + +SEQ_LEN = 2048 +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 +VOCAB_SIZE = 103168 + +MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts" +LOAD_CKPT_FOLDER = "local:llm_ckpts/49" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), + load_ckpt_folder="local:llm_ckpts/", + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = "/path/to/dataset" +VALID_FOLDER = "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=2, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=50, + pack_sample_into_one=False, + total_steps=50000, + skip_batches="", + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + # train_folder=TRAIN_FOLDER, + # valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=10, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=True, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +model = dict( + checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + use_flash_attn=True, + num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. +) +""" +zero1 parallel: + 1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. + 3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. +tensor parallel: tensor parallel size, usually the number of GPUs per node. 
+""" +parallel = dict( + zero1=8, + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=False, +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + ), +) +``` +接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。 #### 数据配置 数据相关的关键参数配置及释义如下所示: diff --git a/internlm/core/trainer.py b/internlm/core/trainer.py index a96809e..7954bc6 100644 --- a/internlm/core/trainer.py +++ b/internlm/core/trainer.py @@ -4,6 +4,7 @@ # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine import json +from collections import deque from typing import Iterable, Optional from internlm.core.engine import Engine @@ -58,6 +59,24 @@ class TrainState: if batch_sampler: self.init_batch_sampler(batch_sampler) + # tgs statistic + self.tgs_statistic = { + "sum_step": 0, + "sum_tg": 0, + "sum_time": 0, + "sum_last_tg_10": 0, + "sum_last_time_10": 0, + "sum_last_tg_50": 0, + "sum_last_time_50": 0, + "SMA_tg_50": 0, + "SMA_time_50": 0, + "SMA_tg_50_list": deque(), + "SMA_time_50_list": deque(), + "sum_tgs": 0, + "last_tgs_10": 0, + "last_tgs_50": 0, + } + def init_batch_sampler(self, batch_sampler): """ Args: diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 30cc9e0..4181f20 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -379,9 +379,52 @@ def record_current_batch_training_metrics( max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]]) max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]]) min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]]) - - tk_per_gpu = 0 + time_cost = time.time() - start_time tk_per_gpu = round( + num_tokens_in_batch * gpc.get_world_size(ParallelMode.DATA) / gpc.get_world_size(ParallelMode.GLOBAL), + 4, + ) + tgs_statistic = train_state.tgs_statistic + tgs_statistic["sum_step"] += 1 + tgs_statistic["sum_tg"] += tk_per_gpu + tgs_statistic["sum_time"] += time_cost + tgs_statistic["sum_last_tg_10"] += tk_per_gpu + tgs_statistic["sum_last_time_10"] += time_cost + tgs_statistic["sum_last_tg_50"] += tk_per_gpu + tgs_statistic["sum_last_time_50"] += time_cost + tgs_statistic["SMA_tg_50"] += tk_per_gpu + tgs_statistic["SMA_time_50"] += time_cost + tgs_statistic["SMA_tg_50_list"].append(tk_per_gpu) + tgs_statistic["SMA_time_50_list"].append(time_cost) + if tgs_statistic["sum_step"] > 50: + tgs_statistic["SMA_tg_50"] -= tgs_statistic["SMA_tg_50_list"][0] + tgs_statistic["SMA_time_50"] -= tgs_statistic["SMA_time_50_list"][0] + tgs_statistic["SMA_tg_50_list"].popleft() + tgs_statistic["SMA_time_50_list"].popleft() + + last_tgs_1 = round(tk_per_gpu / time_cost, 2) + tgs_statistic["sum_tgs"] += last_tgs_1 + + if tgs_statistic["sum_step"] % 10 == 0: + tgs_statistic["last_tgs_10"] = round(tgs_statistic["sum_last_tg_10"] / tgs_statistic["sum_last_time_10"], 2) + tgs_statistic["sum_last_tg_10"] = 0 + tgs_statistic["sum_last_time_10"] = 0 + + if tgs_statistic["sum_step"] % 50 == 0: + tgs_statistic["last_tgs_50"] = round(tgs_statistic["sum_last_tg_50"] / tgs_statistic["sum_last_time_50"], 2) + tgs_statistic["sum_last_tg_50"] = 0 + tgs_statistic["sum_last_time_50"] = 0 + + last_tgs_10 = tgs_statistic["last_tgs_10"] + last_tgs_50 = tgs_statistic["last_tgs_50"] + + tgs_all = 
round(tgs_statistic["sum_tg"] / tgs_statistic["sum_time"], 2) + tgs_avg = round(tgs_statistic["sum_tgs"] / tgs_statistic["sum_step"], 2) + tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2) + + tflops = get_tflops_func((time.time() - start_time)) + + tgs_origin = round( num_tokens_in_batch * gpc.get_world_size(ParallelMode.DATA) / gpc.get_world_size(ParallelMode.GLOBAL) @@ -389,14 +432,18 @@ def record_current_batch_training_metrics( 2, ) - tflops = get_tflops_func((time.time() - start_time)) - infos = { "tflops": tflops, "step": batch_count, "loss": loss.item() - moe_loss.item(), "moe_loss": moe_loss.item(), - "tgs (tokens/gpu/second)": tk_per_gpu, + "tgs (tokens/gpu/second)": tgs_origin, + "tgs/last_tgs_1": last_tgs_1, + "tgs/tgs_all": tgs_all, + "tgs/tgs_avg": tgs_avg, + "tgs/tgs_SMA": tgs_SMA, + "tgs/last_tgs_10": last_tgs_10, + "tgs/last_tgs_50": last_tgs_50, "lr": lr, "loss_scale": scaler, "grad_norm": grad_norm, @@ -436,7 +483,7 @@ def record_current_batch_training_metrics( "num_consumed_tokens": train_state.num_consumed_tokens, "loss": loss.item() - moe_loss.item(), "flops": tflops, - "tgs": tk_per_gpu, + "tgs": last_tgs_1, "acc": acc_perplex["acc"], "perplexity": acc_perplex["perplexity"], "fwd_bwd_time": fwd_bwd_time, diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 9307138..ee8481c 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -541,8 +541,8 @@ class CheckpointManager: Args: ckpt_config (dict): model checkpoint config. - model (nn.module): model obj - optimizer (object): optimzier obj. + model (nn.module): model obj. + optimizer (object): optimizer obj. lr_scheduler (object): lr_scheduler obj. model_config (dict): model config. 
""" @@ -806,7 +806,6 @@ now step_count is {train_state.step_count}", return dict(path=latest_ckpt, content=("all",), ckpt_type="internlm") def try_resume_training(self, train_state: TrainState, current_time=""): - if self.load_ckpt_info is None or self.load_ckpt_info["path"] is None: if gpc.is_rank_for_log(): logger.info( diff --git a/tests/test_model/test_embedding.py b/tests/test_model/test_embedding.py new file mode 100644 index 0000000..324ca2b --- /dev/null +++ b/tests/test_model/test_embedding.py @@ -0,0 +1,65 @@ +import multiprocessing as mp + +import pytest +import torch + +from internlm.model.embedding import Embedding1D +from tests.test_model.test_model_internlm import build_environment, seed_all + + +def check_embedding(args): + # init + rank, world_size = args + device = torch.device("cuda") + build_environment(rank, world_size) + rtol, atol = (1e-3, 5e-3) + vocab_size = 4 + hidden_size = 2 + + # fix seed + seed_all(1024) + + # define embedding + embedding = Embedding1D( + num_embeddings=vocab_size, + embedding_dim=hidden_size, + padding_idx=None, + ) + + embedding.weight.data.copy_(torch.randn(vocab_size, hidden_size)) + embedding = embedding.to(device) + + # create input + input_ids = torch.tensor([[0, 2], [1, 3]]).to(device) + result = embedding(input_ids) + + standard_list = [[[-1.4837, 0.2671], [0.6002, -0.5496]], [[-1.8337, -0.1047], [1.0391, 0.2261]]] + standard_result = torch.tensor(standard_list).to(device) + + # check output + assert torch.allclose(result, standard_result, rtol=rtol, atol=atol, equal_nan=True) + + loss = torch.randn_like(result) + + # backward + result.backward(loss) + + grad = embedding.weight.grad + standard_glist = [[-0.4461, 0.5602], [0.4353, 1.2988], [-0.0625, -1.3609], [0.9595, -0.1144]] + standard_grad = torch.tensor(standard_glist).to(device) + + # check grad + assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol, equal_nan=True) + + +@pytest.mark.embedding +def test_embedding(): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map(check_embedding, [[rank, 8] for rank in range(8)]) + pool.close() + pool.join() + + +if __name__ == "__main__": + pytest.main(["-s", "-q", "test_embedding.py"]) diff --git a/tests/test_model/test_model_internlm.py b/tests/test_model/test_model_internlm.py new file mode 100644 index 0000000..fb9c678 --- /dev/null +++ b/tests/test_model/test_model_internlm.py @@ -0,0 +1,379 @@ +import multiprocessing as mp +import random + +import numpy as np +import pytest +import torch +from torch import nn + +import internlm +from internlm.core.context import ParallelMode +from internlm.core.context.parallel_context import Config +from internlm.core.context.parallel_context import global_context as gpc +from internlm.model.linear import RewardModelLinear, ScaleColumnParallelLinear +from internlm.model.modeling_internlm import PackedFlashBaseLayer1D +from internlm.model.utils import gather_forward_split_backward + +config = Config( + dict( + parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1), + model_type="INTERNLM", + data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), + model=dict( + checkpoint=False, + num_attention_heads=2, + embed_split_hidden=True, + vocab_size=103168, + embed_grad_scale=1, + parallel_output=True, + hidden_size=1024, + num_layers=2, + mlp_ratio=1, + apply_post_layer_norm=False, + dtype=torch.bfloat16, + norm_type="rmsnorm", + layer_norm_epsilon=1e-5, + 
use_flash_attn=True, + num_chunks=1, + ), + resume_tb_folder="", + tensorboard_folder="", + alert_address=None, + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + ) +) + + +def build_environment(rank, world_size): + import os + + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "12345" + torch.cuda.empty_cache() + # launcher="torch" + internlm.launch_from_torch(config=config, seed=1024) + + +def seed_all(seed, cuda_deterministic=False): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if cuda_deterministic: # slower, more reproducible + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True + + +def check_block(args): + # init + rank, world_size = args + build_environment(rank, world_size) + device = torch.device("cuda") + rtol, atol = (1e-3, 5e-3) + + # fix seed + seed_all(1024) + + # define block + blocks = nn.ModuleList( + [ + PackedFlashBaseLayer1D( + hidden_size=4, # 768 + num_attention_heads=2, # 12 + mlp_ratio=2, + attn_drop_rate=0.0, + drop_rate=0.0, + dtype=torch.bfloat16, + layer_norm_epsilon=1e-5, + checkpoint=lid < 0, + layer_idx=lid + 0, # This parameter is used for caching during generation + residual_in_fp32=False, + device=device, + norm_type="rmsnorm", + dropout_selective_checkpoint=True, + use_scaled_init=True, + use_swiglu=True, + ) + for lid in range(4) # 32 + ] + ) + + # create input + cu_seqlens = torch.tensor([0, 2, 4], dtype=torch.int32).to(device) # [0, 8, 16] + indexes = torch.tensor([0, 1, 0, 1]).to(device) # [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7] + hidden_states = torch.tensor([[0, 3, 2, 1]]).to(device) # [[4, 118, 0, 1, 2, 3, 0, 1, 1, 97, 0, 0, 0, 0, 0, 0]] + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + + hidden_states = torch.tensor( + [ + [ + [-1.1620, 1.3113, 0.1507, 2.2698], + [-1.2610, 1.0990, 0.3787, -0.3478], + [1.4001, 1.1982, -0.6696, 0.3269], + [1.3304, 1.2262, 1.0735, -1.1169], + ] + ] + ) + + hidden_states = hidden_states.squeeze(0).to(device).requires_grad_() + + # forward + for _, block in enumerate(blocks): + block = block.to(torch.bfloat16) + block = block.to(device) + hidden_states = block( + hidden_states, + cu_seqlens=cu_seqlens, + indexes=indexes, + inference_params=None, + max_seqlen=max_seqlen, + ) + + result = hidden_states + standard_result = torch.tensor( + [ + [-1.1621, 1.3111, 0.1509, 2.2697], + [-1.2611, 1.0988, 0.3787, -0.3478], + [1.4000, 1.1982, -0.6694, 0.3268], + [1.3303, 1.2262, 1.0736, -1.1169], + ] + ).to(device) + + # check output + assert torch.allclose(result, standard_result, rtol=rtol, atol=atol) + + hidden_states.retain_grad() + loss = torch.randn_like(result) + + # backward + result.backward(loss) + + grad = hidden_states.grad + standard_grad = torch.tensor( + [ + [0.7999, -0.2595, 0.2649, -1.3256], + [0.7064, 0.0283, -0.5508, 0.6494], + [-1.4657, -2.0316, 1.3776, 0.7211], + [-0.6046, 0.4329, -0.1884, 1.1170], + ] + ).to(device) + + # check grad + assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol) + + +def check_head(args): + # init + rank, world_size, is_reward = args + device = torch.device("cuda") + build_environment(rank, world_size) + rtol, 
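# Illustrative sketch: how the packed-sequence metadata used by check_block fits together.
# cu_seqlens stores cumulative sequence boundaries, indexes stores each token's position
# inside its own sequence, and max_seqlen is the length of the longest packed sequence.
import torch

cu_seqlens = torch.tensor([0, 2, 4], dtype=torch.int32)       # two sequences packed into 4 tokens
seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]                   # tensor([2, 2], dtype=torch.int32)
max_seqlen = seq_lens.max().item()                            # 2
indexes = torch.cat([torch.arange(int(n)) for n in seq_lens])  # tensor([0, 1, 0, 1])
print(seq_lens.tolist(), max_seqlen, indexes.tolist())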
atol = (1e-3, 5e-3) + hidden_size = 4 + vocab_size = 4 + embed_grad_scale = 1 + + # fix seed + seed_all(1024) + + # load standard + if is_reward: + head_cls = RewardModelLinear + standard_result = torch.tensor([[3.5938], [1.0703], [3.6250], [3.6250]], dtype=torch.bfloat16).to(device) + standard_grad = torch.tensor( + [ + [-0.2246, 0.0164, -0.0591, 0.1660], + [-0.5625, 0.0408, -0.1484, 0.4160], + [-0.1758, 0.0128, -0.0464, 0.1299], + [-0.4785, 0.0347, -0.1260, 0.3516], + ], + dtype=torch.bfloat16, + ).to(device) + else: + head_cls = ScaleColumnParallelLinear + standard_result = torch.tensor( + [ + [3.5938, -2.2188, 2.0312, 3.5625], + [1.0703, -1.1797, 1.1406, 1.6641], + [3.6250, -2.0156, 1.7656, 3.4531], + [3.6250, -2.0156, 1.7656, 3.4531], + ], + dtype=torch.bfloat16, + ).to(device) + standard_grad = torch.tensor( + [ + [-0.2354, 0.0981, -0.2930, -0.6328], + [0.2344, -0.2334, -0.0918, 0.1396], + [-0.5898, -1.0156, -0.7070, 1.3750], + [0.0242, -0.1494, 0.1206, -0.0427], + ], + dtype=torch.bfloat16, + ).to(device) + + # define head + head = head_cls( + in_features=hidden_size, + out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, + process_group=gpc.get_group(ParallelMode.TENSOR), + bias=False, + device=device, + dtype=torch.bfloat16, + weight_scale=embed_grad_scale, + ) + + head = head.to(torch.bfloat16) + head = head.to(device) + + # create input + hidden_states = torch.tensor( + [ + [8.3726, 1.9245, 5.5101, 1.0000], + [3.3474, 2.9582, 1.0000, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + ], + dtype=torch.bfloat16, + requires_grad=True, + ).to(device) + + # forward + result = head(hidden_states) + + # check output + assert torch.allclose(result, standard_result, rtol=rtol, atol=atol) + + hidden_states.retain_grad() + loss = torch.randn_like(result) + + # backward + result.backward(loss) + grad = hidden_states.grad + + # check grad + assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol) + + +def check_gather_forward(args): + # init + rank, world_size, parallel_tensor = args + assert parallel_tensor in [1, 2] + config.parallel.tensor = parallel_tensor + device = torch.device("cuda") + build_environment(rank, world_size) + rtol, atol = (1e-3, 5e-3) + + # fix seed + seed_all(1024) + + # load standard + if parallel_tensor == 1: + standard_result = torch.tensor( + [ + [8.3726, 1.9245, 5.5101, 1.0000], + [3.3474, 2.9582, 1.0000, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + ] + ).to(device) + standard_grad = torch.tensor( + [ + [-0.4461, 0.5602, -0.0625, -1.3609], + [0.4353, 1.2988, 0.9595, -0.1144], + [-0.7593, -0.4031, 0.2041, 1.4955], + [0.5706, 0.9047, -0.6965, -0.3757], + ] + ).to(device) + else: + standard_result = torch.tensor( + [ + [8.3726, 1.9245, 5.5101, 1.0000, 8.3726, 1.9245, 5.5101, 1.0000], + [3.3474, 2.9582, 1.0000, 1.0000, 3.3474, 2.9582, 1.0000, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000, 8.3726, 1.2875, 5.5101, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000, 8.3726, 1.2875, 5.5101, 1.0000], + ] + ).to(device) + if rank % 2 == 0: + standard_grad = torch.tensor( + [ + [-0.4461, 0.5602, -0.0625, -1.3609], + [-0.7593, -0.4031, 0.2041, 1.4955], + [0.8093, 1.7580, 1.2996, -0.7545], + [1.0474, -0.5767, -1.0401, 0.8233], + ] + ).to(device) + else: + standard_grad = torch.tensor( + [ + [0.4353, 1.2988, 0.9595, -0.1144], + [0.5706, 0.9047, -0.6965, -0.3757], + [-1.3589, -0.7202, 0.6094, -0.8208], + [-1.0042, 0.3695, 0.2511, -0.2718], + ] + ).to(device) + + # create input + 
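# Illustrative sketch: rough intuition for the op exercised by check_gather_forward,
# assuming the usual gather-along-last-dim semantics. In the forward pass each rank's
# (4, 4) activation shard is gathered and concatenated on the last dimension, so tensor
# parallel size 2 yields a (4, 8) result on every rank; in the backward pass the incoming
# gradient is split so each rank keeps only the slice matching its own shard. Plain
# torch.cat / chunk stand in here for the real distributed collectives.
import torch

shard_rank0 = torch.randn(4, 4)
shard_rank1 = torch.randn(4, 4)
gathered = torch.cat([shard_rank0, shard_rank1], dim=-1)      # forward: every rank sees (4, 8)
grad_out = torch.ones_like(gathered)
grad_rank0, grad_rank1 = grad_out.chunk(2, dim=-1)            # backward: each rank keeps its slice
print(gathered.shape, grad_rank0.shape)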
hidden_states = torch.tensor( + [ + [8.3726, 1.9245, 5.5101, 1.0000], + [3.3474, 2.9582, 1.0000, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + ], + requires_grad=True, + ).to(device) + + # forward + result = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1) + + # check output + assert torch.allclose(result, standard_result, rtol=rtol, atol=atol) + + loss = torch.randn_like(result) + hidden_states.retain_grad() + + # backward + result.backward(loss) + grad = hidden_states.grad + + # check grad + assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol) + + +@pytest.mark.block +def test_block(): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map(check_block, [[rank, 8] for rank in range(8)]) + pool.close() + pool.join() + + +@pytest.mark.head +@pytest.mark.parametrize("is_reward", [True, False]) +def test_head(is_reward): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map(check_head, [[rank, 8, is_reward] for rank in range(8)]) + pool.close() + pool.join() + + +@pytest.mark.gather_forward +@pytest.mark.parametrize("parallel_tensor", [1, 2]) +def test_gather_forward(parallel_tensor): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map(check_gather_forward, [[rank, 8, parallel_tensor] for rank in range(8)]) + pool.close() + pool.join() + + +if __name__ == "__main__": + pytest.main(["-s", "-q", "test_model_internlm.py"]) diff --git a/tests/test_model/test_norm.py b/tests/test_model/test_norm.py new file mode 100644 index 0000000..4078ef5 --- /dev/null +++ b/tests/test_model/test_norm.py @@ -0,0 +1,84 @@ +import multiprocessing as mp + +import pytest +import torch + +from internlm.model.utils import try_import_RMSNorm +from tests.test_model.test_model_internlm import build_environment, seed_all + +RMSNorm = try_import_RMSNorm() + + +def check_norm(args): + # init + rank, world_size = args + device = torch.device("cuda") + build_environment(rank, world_size) + rtol, atol = (1e-3, 5e-3) + hidden_size = 4 + layer_norm_epsilon = 1e-05 + + # fix seed + seed_all(1024) + + # define norm + norm = RMSNorm(hidden_size, eps=layer_norm_epsilon) + norm = norm.to(device) + + # create input + hidden_states = torch.tensor( + [ + [8.3726, 1.9245, 5.5101, 1.0000], + [3.3474, 2.9582, 1.0000, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + [8.3726, 1.2875, 5.5101, 1.0000], + ], + requires_grad=True, + ).to(device) + + # forward + result = norm(hidden_states.float()) + + standard = torch.tensor( + [ + [1.6329, 0.3753, 1.0746, 0.1950], + [1.4288, 1.2626, 0.4268, 0.4268], + [1.6490, 0.2536, 1.0852, 0.1970], + [1.6490, 0.2536, 1.0852, 0.1970], + ] + ).to(device) + + # check output + assert torch.allclose(result, standard, rtol=rtol, atol=atol, equal_nan=True) + + hidden_states.retain_grad() + loss = torch.randn_like(result) + + # backward + result.backward(loss) + grad = hidden_states.grad + + standard_grad = torch.tensor( + [ + [-0.0193, 0.1248, 0.0324, -0.2573], + [-0.2140, 0.2010, 0.2901, -0.1683], + [-0.0815, -0.0689, 0.0850, 0.3027], + [0.0847, 0.1739, -0.1554, -0.0773], + ] + ).to(device) + + # check grad + assert torch.allclose(grad, standard_grad, rtol=rtol, atol=atol, equal_nan=True) + + +@pytest.mark.norm +def test_norm(): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map(check_norm, [[rank, 8] for rank in range(8)]) + pool.close() + pool.join() + + +if __name__ == "__main__": + pytest.main(["-s", "-q", "test_norm.py"]) diff 
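# Illustrative sketch: the reference formula test_norm checks RMSNorm against, assuming
# the usual definition y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight with the
# learnable weight initialized to ones.
import torch

def rms_norm_reference(x, weight, eps=1e-5):
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight

x = torch.tensor([[8.3726, 1.9245, 5.5101, 1.0000]])
print(rms_norm_reference(x, torch.ones(4)))   # ≈ [[1.6329, 0.3753, 1.0746, 0.1950]]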
--git a/tests/test_solver/test_optimizer.py b/tests/test_solver/test_optimizer.py new file mode 100644 index 0000000..6a22797 --- /dev/null +++ b/tests/test_solver/test_optimizer.py @@ -0,0 +1,364 @@ +import copy +import multiprocessing as mp +import random + +import numpy as np +import pytest +import torch +from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.testing import assert_close + +import internlm +from internlm.core.context.parallel_context import Config +from internlm.solver.optimizer import HybridZeroOptimizer +from internlm.solver.optimizer.utils import ParamBcastSyncHandler + + +class MlpModel(nn.Module): + def __init__(self): + super().__init__() + self.linear1 = nn.Linear(128, 256) + self.linear2 = nn.Linear(256, 512) + + def forward(self, x): + x = self.linear1(x) + x = self.linear2(x) + return x + + +config = Config( + dict( + parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1), + model_type="INTERNLM", + data=dict(seq_len=2048, micro_num=1, micro_bsz=1, pack_sample_into_one=False, min_length=0, total_steps=9999), + model=dict( + dtype=torch.bfloat16, + ), + resume_tb_folder="", + tensorboard_folder="", + alert_address=None, + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + grad_scaler=dict( + fp16=dict( + initial_scale=1, + min_scale=1, + growth_interval=1, + ), + growth_factor=1.1, + backoff_factor=0.9, + max_scale=1, + hysteresis=1, + ), + adam=dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, + ), + hybrid_zero_optimizer=dict( + overlap_sync_grad=False, + overlap_sync_param=False, + reduce_bucket_size=512 * 1024 * 1024, + clip_grad_norm=1.0, + ), + ) +) + + +def build_environment(rank, world_size): + import os + + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "12345" + torch.cuda.empty_cache() + # launcher="torch" + internlm.launch_from_torch(config=config, seed=1024) + + +def loose_close(a, b, dtype: torch.dtype = torch.float32): + + if dtype is torch.float32: + rtol = 1.3e-6 + atol = 1e-5 + elif dtype is torch.bfloat16: + rtol = 2e-2 + atol = 2e-2 + + if isinstance(a, torch.Tensor): + a = a.detach().to(dtype) + b = b.detach().to(dtype) + + assert_close(a, b, rtol=rtol, atol=atol) + + +def init_optimizer_grouped_parameters(check_group, model): + if check_group: + optimizer_grouped_parameters = [ + { + "params": list(model.parameters())[:2], + "weight_decay": config.adam.weight_decay, + }, + { + "params": list(model.parameters())[2:], + "weight_decay": config.adam.weight_decay, + }, + ] + else: + optimizer_grouped_parameters = [{"params": model.parameters(), "weight_decay": config.adam.weight_decay}] + + return optimizer_grouped_parameters + + +def seed_all(seed, cuda_deterministic=False): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if cuda_deterministic: # slower, more reproducible + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True + + +def exam_hybrid_zero_optim_with_ddp(args): + # init + rank, world_size, zero_parallel, overlap_sync_param, overlap_sync_grad, 
micro_num, check_group, dtype = args + # TODO: Need to test the combine of overlap param and group_params when ready + # ParamBcastSyncHandler does not consider paramters in different optimizer group currently + if overlap_sync_param and check_group: + return + config.parallel.zero1 = zero_parallel + config.hybrid_zero_optimizer.overlap_sync_param = overlap_sync_param + config.hybrid_zero_optimizer.overlap_sync_grad = overlap_sync_grad + config.data.micro_num = micro_num + config.model.dtype = dtype + totel_step = 5 + if not overlap_sync_param: + totel_step = 1 + + build_environment(rank, world_size) + seed_all(1024) + + # create models + torch_model = MlpModel().cuda() + zero_model = copy.deepcopy(torch_model).to(dtype) + torch_model = DDP(torch_model.cuda(), static_graph=True).cuda() + + # create optimizer + if config.hybrid_zero_optimizer.overlap_sync_param: + param_bcast_sync_handler = ParamBcastSyncHandler(zero_model) + else: + param_bcast_sync_handler = None + + optimizer_grouped_parameters_zero = init_optimizer_grouped_parameters(check_group, zero_model) + optimizer_grouped_parameters_torch = init_optimizer_grouped_parameters(check_group, torch_model) + + naive_optimizer = torch.optim.AdamW( + params=optimizer_grouped_parameters_zero, + lr=config.adam.lr, + betas=(config.adam.adam_beta1, config.adam.adam_beta2), + eps=config.adam.adam_eps, + ) + + zero_optimizer = HybridZeroOptimizer( + naive_optimizer, + grad_scal_cfg=config.grad_scaler, + zero_cfg=config.hybrid_zero_optimizer, + param_bcast_sync_handler=param_bcast_sync_handler, + ) + + torch_optimizer = torch.optim.AdamW( + params=optimizer_grouped_parameters_torch, + lr=config.adam.lr, + betas=(config.adam.adam_beta1, config.adam.adam_beta2), + eps=config.adam.adam_eps, + ) + + for _ in range(totel_step): + zero_optimizer.zero_grad() + torch_optimizer.zero_grad() + zero_optimizer.skip_grad_reduce = True + for num in range(micro_num): + if num == micro_num - 1: + zero_optimizer.skip_grad_reduce = False + + seed_all(1024 + rank) + # create input + input_data = torch.rand(16, 128).cuda() + + # zero-dp forward + zero_output = zero_model(input_data.to(dtype)) + + # torch-ddp forward + torch_output = torch_model(input_data) + + # check output + loose_close(zero_output, torch_output, dtype=dtype) + + # zero-dp backward + zero_optimizer.backward(zero_output.mean()) + + # torch-ddp backward + if num == micro_num - 1: + torch_output.mean().backward() + else: + with torch_model.no_sync(): + torch_output.mean().backward() + + # zero-dp step + zero_optimizer.step() + + # torch-ddp step + torch_optimizer.step() + + # check grad + if check_group: + group1 = zip(list(torch_model.parameters())[:2], list(zero_model.parameters())[:2]) + group2 = zip(list(torch_model.parameters())[2:], list(zero_model.parameters())[2:]) + for torch_parm, zero_parm in group1: + if zero_parm.grad is not None: + loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype) + for torch_parm, zero_parm in group2: + if zero_parm.grad is not None: + loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype) + else: + for torch_parm, zero_parm in zip(torch_model.parameters(), zero_model.parameters()): + if zero_parm.grad is not None: + loose_close(torch_parm.grad, zero_parm.grad, dtype=dtype) + + torch.cuda.synchronize() + # check updated param + if check_group: + group1 = zip(list(torch_model.parameters())[:2], list(zero_model.parameters())[:2]) + group2 = zip(list(torch_model.parameters())[2:], list(zero_model.parameters())[2:]) + for torch_parm, zero_parm in group1: + 
loose_close(torch_parm, zero_parm, dtype=dtype) + for torch_parm, zero_parm in group2: + loose_close(torch_parm, zero_parm, dtype=dtype) + else: + for torch_parm, zero_parm in zip(torch_model.parameters(), zero_model.parameters()): + loose_close(torch_parm, zero_parm, dtype=dtype) + + +def exam_hybrid_zero_optim_with_ckpt_load_save(args): + # init + rank, world_size, zero_parallel, check_group, dtype = args + config.parallel.zero1 = zero_parallel + config.parallel.dtype = dtype + + build_environment(rank, world_size) + + # create models + zero_model = MlpModel().cuda().to(dtype) + + # create optimizer + if config.hybrid_zero_optimizer.overlap_sync_param: + param_bcast_sync_handler = ParamBcastSyncHandler(zero_model) + else: + param_bcast_sync_handler = None + + optimizer_grouped_parameters1 = init_optimizer_grouped_parameters(check_group, zero_model) + optimizer_grouped_parameters2 = init_optimizer_grouped_parameters(check_group, zero_model) + + naive_optimizer = torch.optim.AdamW( + params=optimizer_grouped_parameters1, + lr=config.adam.lr, + betas=(config.adam.adam_beta1, config.adam.adam_beta2), + eps=config.adam.adam_eps, + ) + + zero_optimizer = HybridZeroOptimizer( + naive_optimizer, + grad_scal_cfg=config.grad_scaler, + zero_cfg=config.hybrid_zero_optimizer, + param_bcast_sync_handler=param_bcast_sync_handler, + ) + + naive_optimizer2 = torch.optim.AdamW( + params=optimizer_grouped_parameters2, + lr=config.adam.lr, + betas=(config.adam.adam_beta1, config.adam.adam_beta2), + eps=config.adam.adam_eps, + ) + + zero_optimizer2 = HybridZeroOptimizer( + naive_optimizer2, + grad_scal_cfg=config.grad_scaler, + zero_cfg=config.hybrid_zero_optimizer, + param_bcast_sync_handler=param_bcast_sync_handler, + ) + + # save and load states + states = zero_optimizer.state_dict() + zero_optimizer2.load_state_dict(states) + + # check fp32 model weights + for zero1_param, zero2_param in zip( + zero_optimizer._fp32_flat_param_groups_of_current_rank.values(), + zero_optimizer2._fp32_flat_param_groups_of_current_rank.values(), + ): + assert torch.equal(zero1_param, zero2_param) + + # check fp16 model weights + for zero1_param, zero2_param in zip( + zero_optimizer._fp16_param_groups.values(), zero_optimizer2._fp16_param_groups.values() + ): + assert zero1_param == zero2_param + + +zero_parallel_check_list = [-1, 1, 4] +overlap_sync_param_check_list = [True, False] +overlap_sync_grad_check_list = [True, False] +miro_num_check_list = [1, 2, 4] +check_group_list = [True, False] +dtype_list = [torch.float32, torch.bfloat16] + + +@pytest.mark.parametrize("zero_parallel", zero_parallel_check_list) +@pytest.mark.parametrize("overlap_sync_param", overlap_sync_param_check_list) +@pytest.mark.parametrize("overlap_sync_grad", overlap_sync_grad_check_list) +@pytest.mark.parametrize("micro_num", miro_num_check_list) +@pytest.mark.parametrize("check_group", check_group_list) +@pytest.mark.parametrize("dtype", dtype_list) +def test_hybrid_zero_optim_with_ddp( + zero_parallel, overlap_sync_param, overlap_sync_grad, micro_num, check_group, dtype +): + ctx = mp.get_context("spawn") + with ctx.Pool(processes=8) as pool: + pool.map( + exam_hybrid_zero_optim_with_ddp, + [ + [rank, 8, zero_parallel, overlap_sync_param, overlap_sync_grad, micro_num, check_group, dtype] + for rank in range(8) + ], + ) + pool.close() + pool.join() + + +@pytest.mark.parametrize("zero_parallel", zero_parallel_check_list) +@pytest.mark.parametrize("check_group", check_group_list) +@pytest.mark.parametrize("dtype", dtype_list) +def 
test_hybrid_zero_optim_with_ckpt_load_save(zero_parallel, check_group, dtype):
+    ctx = mp.get_context("spawn")
+    with ctx.Pool(processes=8) as pool:
+        pool.map(
+            exam_hybrid_zero_optim_with_ckpt_load_save,
+            [[rank, 8, zero_parallel, check_group, dtype] for rank in range(8)],
+        )
+        pool.close()
+        pool.join()
+
+
+if __name__ == "__main__":
+    pytest.main(["-s", "-q", "test_optimizer.py"])
diff --git a/tools/transformers/convert2hf.py b/tools/transformers/convert2hf.py
index 7b44090..f8604df 100644
--- a/tools/transformers/convert2hf.py
+++ b/tools/transformers/convert2hf.py
@@ -38,7 +38,7 @@ def convert2hf(model_config, states_tp_pps):
     current_states["lm_head.weight"] = states.pop("head.weight")
 
     for i in range(model_config["num_layers"]):
-        states.pop(f"blocks.{i}.mixer.rotary_emb.inv_freq")
+        states.pop(f"blocks.{i}.mixer.rotary_emb.inv_freq", None)
 
         wqkv = states.pop(f"blocks.{i}.mixer.Wqkv.weight").reshape(
             3, model_config["num_attention_heads"], -1, model_config["hidden_size"]
diff --git a/tools/transformers/modeling_internlm.py b/tools/transformers/modeling_internlm.py
index df1e19f..da7aaa0 100644
--- a/tools/transformers/modeling_internlm.py
+++ b/tools/transformers/modeling_internlm.py
@@ -20,6 +20,7 @@
 """ PyTorch InternLM model."""
 import math
 from typing import List, Optional, Tuple, Union
+import threading, queue
 
 import torch
 import torch.utils.checkpoint
@@ -810,35 +811,70 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                     temperature: float = 0.8,
                     top_p: float = 0.8,
                     **kwargs):
+        """
+        Return a generator in format: (response, history)
+        e.g.
+        ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
+        ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
+        """
+
+        response_queue = queue.Queue(maxsize=20)
+
         class ChatStreamer(BaseStreamer):
             def __init__(self, tokenizer) -> None:
                 super().__init__()
                 self.tokenizer = tokenizer
-
+                self.queue = response_queue
+                self.query = query
+                self.history = history
+                self.response = ""
+                self.received_inputs = False
+                self.queue.put((self.response, history + [(self.query, self.response)]))
+
             def put(self, value):
                 if len(value.shape) > 1 and value.shape[0] > 1:
                     raise ValueError("ChatStreamer only supports batch size 1")
                 elif len(value.shape) > 1:
                     value = value[0]
+
+                if not self.received_inputs:
+                    # The first received value is input_ids, ignore here
+                    self.received_inputs = True
+                    return
+
                 token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
                 if token.strip() != "":
-                    print(token, end="")
-
+                    self.response = self.response + token
+                    history = self.history + [(self.query, self.response)]
+                    self.queue.put((self.response, history))
+
             def end(self):
-                print("")
-
-        return self.chat(
-            tokenizer=tokenizer,
-            query=query,
-            streamer=ChatStreamer(tokenizer=tokenizer),
-            history=history,
-            max_new_tokens=max_new_tokens,
-            do_sample=do_sample,
-            temperature=temperature,
-            top_p=top_p,
-            **kwargs
-        )
-
+                self.queue.put(None)
+
+        def stream_producer():
+            return self.chat(
+                tokenizer=tokenizer,
+                query=query,
+                streamer=ChatStreamer(tokenizer=tokenizer),
+                history=history,
+                max_new_tokens=max_new_tokens,
+                do_sample=do_sample,
+                temperature=temperature,
+                top_p=top_p,
+                **kwargs
+            )
+
+        def consumer():
+            producer = threading.Thread(target=stream_producer)
+            producer.start()
+            while True:
+                res = response_queue.get()
+                if res is None:
+                    return
+                yield res
+
+        return consumer()
+
 
 @add_start_docstrings(
     """
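# Illustrative sketch: consuming the generator returned by the reworked stream_chat.
# The checkpoint id below is only an example; any InternLM chat checkpoint exported for
# Hugging Face transformers should follow the same (response, history) streaming contract
# documented in the docstring above.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "internlm/internlm-chat-7b"  # example checkpoint id, adjust to your own export
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).cuda().eval()

history = []
for response, history in model.stream_chat(tokenizer, "你好", history=history):
    # Each yielded item is the partial response so far plus the updated history.
    print(response)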