From d4a81fad5d60ab1d0bde039b9295ec59a197b434 Mon Sep 17 00:00:00 2001 From: ly015 Date: Tue, 12 Dec 2023 16:17:56 +0800 Subject: [PATCH] modifications by pre-commit hook --- .github/workflows/demo_in_readme.yaml | 4 +- .pre-commit-config.yaml | 2 +- .pylintrc | 2 +- README-zh-Hans.md | 10 +- README.md | 8 +- .../locales/en/LC_MESSAGES/checkpoint.po | 1 - .../en/LC_MESSAGES/example/30B_demo.po | 1 - .../locales/en/LC_MESSAGES/example/7B_demo.po | 1 - .../locales/en/LC_MESSAGES/example/index.po | 1 - doc/code-docs/locales/en/LC_MESSAGES/index.po | 1 - .../locales/en/LC_MESSAGES/initialize.po | 1 - .../locales/en/LC_MESSAGES/install.po | 1 - .../locales/en/LC_MESSAGES/monitor.po | 1 - .../locales/en/LC_MESSAGES/parallel.po | 1 - .../locales/en/LC_MESSAGES/profiler.po | 1 - doc/code-docs/locales/en/LC_MESSAGES/qa.po | 1 - .../locales/en/LC_MESSAGES/training.po | 1 - doc/code-docs/locales/en/LC_MESSAGES/usage.po | 1 - doc/code-docs/requirements.txt | 2 +- doc/code-docs/source/example/30B_demo.rst | 12 +-- doc/code-docs/source/example/7B_demo.rst | 12 +-- doc/code-docs/source/initialize.rst | 4 +- doc/code-docs/source/install.md | 2 +- doc/code-docs/source/parallel.rst | 2 +- doc/code-docs/source/qa.rst | 2 +- doc/code-docs/source/training.rst | 2 +- doc/code-docs/source/usage.md | 2 +- doc/en/install.md | 6 +- doc/en/train_performance.md | 7 +- doc/install.md | 6 +- doc/train_performance.md | 7 +- docker.Makefile | 2 +- docker/Dockerfile-centos | 2 +- docker/Dockerfile-ubuntu | 2 +- experiment/Dockerfile-centos | 2 +- experiment/Dockerfile-ubuntu | 2 +- experiment/README-CN.md | 2 +- experiment/README-EN.md | 2 +- requirements/runtime.txt | 2 +- requirements/torch.txt | 2 +- tools/pal_inference.py | 4 +- tools/transformers/README-zh-Hans.md | 2 +- tools/transformers/README.md | 2 +- tools/transformers/configuration_internlm.py | 9 +- tools/transformers/convert2hf.py | 10 +- tools/transformers/interface.py | 13 +-- tools/transformers/intern_moss_example.py | 39 +++++--- tools/transformers/internlm_sft_on_moss.py | 33 ++++--- tools/transformers/modeling_internlm.py | 96 +++++++++++-------- tools/transformers/tokenization_internlm.py | 4 +- 50 files changed, 175 insertions(+), 160 deletions(-) diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml index a3d4cd9..70c35ea 100644 --- a/.github/workflows/demo_in_readme.yaml +++ b/.github/workflows/demo_in_readme.yaml @@ -1,5 +1,5 @@ name: demo-in-readme -on: +on: pull_request: branches: - "main" @@ -83,7 +83,7 @@ jobs: source activate internlm-env-test export PYTHONPATH=$PWD:$PYTHONPATH sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB} - rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak + rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak - name: torchrun-train run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19cd7c8..8a43efd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,4 +50,4 @@ repos: [ '--rcfile=.pylintrc', '--disable=C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203' - ] \ No newline at end of file + ] diff --git a/.pylintrc b/.pylintrc index ca6ec3b..73cb3ae 100644 --- a/.pylintrc +++ b/.pylintrc @@ -425,4 +425,4 @@ valid-metaclass-classmethod-first-arg=mcs # Exceptions that will emit a warning when being caught. 
Defaults to # "Exception" overgeneral-exceptions=builtins.BaseException, - builtins.Exception \ No newline at end of file + builtins.Exception diff --git a/README-zh-Hans.md b/README-zh-Hans.md index 461e22c..ff9b40c 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -60,13 +60,13 @@ InternLM 是一个开源的轻量级训练框架,旨在支持大模型训练 | **InternLM 7B** | [🤗internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) | [ Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary) | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | 2023-07-06 | -
+
InternLM-20B #### 简介 -InternLM-20B 在超过 **2.3T** Tokens 包含高质量英文、中文和代码的数据上进行预训练,其中 Chat 版本还经过了 SFT 和 RLHF 训练,使其能够更好、更安全地满足用户的需求。 +InternLM-20B 在超过 **2.3T** Tokens 包含高质量英文、中文和代码的数据上进行预训练,其中 Chat 版本还经过了 SFT 和 RLHF 训练,使其能够更好、更安全地满足用户的需求。 -InternLM 20B 在模型结构上选择了深结构,InternLM-20B 的层数设定为60层,超过常规7B和13B模型所使用的32层或者40层。在参数受限的情况下,提高层数有利于提高模型的综合能力。此外,相较于InternLM-7B,InternLM-20B使用的预训练数据经过了更高质量的清洗,并补充了高知识密度和用于强化理解和推理能力的训练数据。因此,它在理解能力、推理能力、数学能力、编程能力等考验语言模型技术水平的方面都得到了显著提升。总体而言,InternLM-20B具有以下的特点: +InternLM 20B 在模型结构上选择了深结构,InternLM-20B 的层数设定为60层,超过常规7B和13B模型所使用的32层或者40层。在参数受限的情况下,提高层数有利于提高模型的综合能力。此外,相较于InternLM-7B,InternLM-20B使用的预训练数据经过了更高质量的清洗,并补充了高知识密度和用于强化理解和推理能力的训练数据。因此,它在理解能力、推理能力、数学能力、编程能力等考验语言模型技术水平的方面都得到了显著提升。总体而言,InternLM-20B具有以下的特点: - 优异的综合性能 - 很强的工具调用功能 - 支持16k语境长度(通过推理时外推) @@ -115,7 +115,7 @@ InternLM 20B 在模型结构上选择了深结构,InternLM-20B 的层数设定
-
+
InternLM-7B #### 模型更新 @@ -175,7 +175,7 @@ InternLM-7B 包含了一个拥有70亿参数的基础模型和一个为实际场 3. 集中注意力:避免分心,集中注意力完成任务。关闭社交媒体和电子邮件通知,专注于任务,这将帮助您更快地完成任务,并减少错误的可能性。 ``` -### 通过 ModelScope 加载 +### 通过 ModelScope 加载 通过以下的代码从 ModelScope 加载 InternLM 模型 (可修改模型名称替换不同的模型) diff --git a/README.md b/README.md index fb50a4a..7f468e2 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,13 @@ Based on the InternLM training framework, we have released two open-sourced pret ## News [20231213] InternLM-7B-Chat and InternLM-20B-Chat checkpoints are updated. With an improved finetuning strategy, the new chat models can generate higher quality responces with greater stylistic diversity. -[20230920] InternLM-20B is released with base and chat versions. +[20230920] InternLM-20B is released with base and chat versions. ## Model Zoo -Our models are released in three platforms: Transformers, ModelScope and OpenXLab. -- There are two kinds of model weights: +Our models are released in three platforms: Transformers, ModelScope and OpenXLab. +- There are two kinds of model weights: 1. huggingface type(marked as HF) 2. original model weight(marked as Original), providing in OpenXLab, which can be loaded by InternLM and finetuned directly. @@ -114,7 +114,7 @@ Overall, InternLM-20B comprehensively outperforms open-source models in the 13B
-
+
InternLM-7B #### News diff --git a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po index bd81fa5..3ddcb09 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po @@ -103,4 +103,3 @@ msgstr "" #~ msgid "traning dataloader object" #~ msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po index 67f2451..6ac0e3b 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po @@ -47,4 +47,3 @@ msgstr "Training Results" #: ../../source/example/30B_demo.rst:175 615a3481b0aa49729b7219b1365519aa msgid "基于以上训练配置和启动命令,两节点 16GPU 下的模型训练部分日志展示如下:" msgstr "Taking the configuration of the demo training on two nodes with 16 GPUs on slurm as an example, the training result log is shown below:" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po index ccc6bca..5e99429 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po @@ -47,4 +47,3 @@ msgstr "Training Results" #: ../../source/example/7B_demo.rst:173 33ec81f34e3c4340beacdb5254069d08 msgid "基于以上训练配置和启动命令,单节点 8GPU 下的模型训练部分日志展示如下:" msgstr "Taking the configuration of the demo training on a single machine with 8 GPUs on slurm as an example, the training result log is shown below:" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/example/index.po b/doc/code-docs/locales/en/LC_MESSAGES/example/index.po index 6324e15..752345e 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/example/index.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/example/index.po @@ -30,4 +30,3 @@ msgstr "" #: ../../source/example/index.rst:13 b095e27dfc924a7a943b7cba5361700a msgid "30B Demo" msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/index.po b/doc/code-docs/locales/en/LC_MESSAGES/index.po index 25645c6..69a862b 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/index.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/index.po @@ -78,4 +78,3 @@ msgstr "" #: ../../source/index.rst:95 a164b772960f4ab8b18c7e8820f69f55 msgid ":ref:`search`" msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po index 3303581..632b470 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po @@ -245,4 +245,3 @@ msgid "" "A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``" " where only ``trainer`` could not be None." msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/install.po b/doc/code-docs/locales/en/LC_MESSAGES/install.po index 4ab8915..7abeb3a 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/install.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/install.po @@ -137,4 +137,3 @@ msgstr "For the local standard image built with dockerfile or pulled, use the fo #: ../../../install.md:87 66613606256e4094a6be5ab2af1269ae msgid "容器内默认目录即 `/InternLM`,根据[使用文档](./usage.md)即可启动训练。" msgstr "The default directory in the container is `/InternLM`, please start training according to the [Usage](./usage.md)." 
- diff --git a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po index 0108368..c0ec5f5 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po @@ -195,4 +195,3 @@ msgstr "" #: internlm.monitor.alert.send_feishu_msg_with_webhook:12 of msgid "An exception rasied by the HTTP post request." msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/parallel.po b/doc/code-docs/locales/en/LC_MESSAGES/parallel.po index 15a8d23..d9770dc 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/parallel.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/parallel.po @@ -454,4 +454,3 @@ msgstr "" #: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:7 of msgid "Whether the gradient is success updated, and the gradient." msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po index 38858cd..3acae56 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po @@ -172,4 +172,3 @@ msgstr "" #: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:1 of msgid "Update the memory state of the optimizer state." msgstr "" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/qa.po b/doc/code-docs/locales/en/LC_MESSAGES/qa.po index 0b32bb6..651a825 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/qa.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/qa.po @@ -22,4 +22,3 @@ msgstr "" #: ../../source/qa.rst:2 e3b22a39640a40cfb527068a7f4bbfc9 msgid "问&答" msgstr "Q&A" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/training.po b/doc/code-docs/locales/en/LC_MESSAGES/training.po index 05c834f..b8771f3 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/training.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/training.po @@ -159,4 +159,3 @@ msgstr "" #~ msgid "InternLM训练流程图" #~ msgstr "InternLM training process" - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/usage.po b/doc/code-docs/locales/en/LC_MESSAGES/usage.po index 37e7cba..8717ecf 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po @@ -364,4 +364,3 @@ msgstr "" #~ msgstr "" #~ "`load_model_only_folder` and `load_ckpt_folder` " #~ "cannot be set at the same time." 
- diff --git a/doc/code-docs/requirements.txt b/doc/code-docs/requirements.txt index 604cb2c..f44a0ae 100644 --- a/doc/code-docs/requirements.txt +++ b/doc/code-docs/requirements.txt @@ -8,4 +8,4 @@ numpy torch tqdm pyecharts -myst-parser \ No newline at end of file +myst-parser diff --git a/doc/code-docs/source/example/30B_demo.rst b/doc/code-docs/source/example/30B_demo.rst index d1182f9..47f8dcc 100644 --- a/doc/code-docs/source/example/30B_demo.rst +++ b/doc/code-docs/source/example/30B_demo.rst @@ -194,9 +194,9 @@ 2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0 2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4 2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0 - 2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan - 2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan - 2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan - 2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 
loss/en=4.6314 loss/cn=nan loss/code=nan - 2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan - 2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan \ No newline at end of file + 2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan + 2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan + 2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan + 2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 
loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan + 2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan + 2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan diff --git a/doc/code-docs/source/example/7B_demo.rst b/doc/code-docs/source/example/7B_demo.rst index 8b8c97b..7815417 100644 --- a/doc/code-docs/source/example/7B_demo.rst +++ b/doc/code-docs/source/example/7B_demo.rst @@ -184,9 +184,9 @@ 2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0 2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1 2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0 - 2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan - 2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 
adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan - 2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan - 2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan - 2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan - 2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan + 2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan + 2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in 
record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan + 2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan + 2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan + 2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan + 2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan diff --git a/doc/code-docs/source/initialize.rst b/doc/code-docs/source/initialize.rst index e94f4f6..fca87a5 100644 --- a/doc/code-docs/source/initialize.rst +++ b/doc/code-docs/source/initialize.rst @@ -9,7 +9,7 @@ InternLM 的训练流程可以归纳为两个步骤: * 初始化Logger、Checkpoint管理器、Monitor管理器、Profiler,对迭代训练的过程观察、预警、记录。 2. 
迭代训练 - + * 根据配置文件定义的张量并行、流水线并行、数据并行的大小,加载训练引擎和调度器进行混合并行训练。 * 在迭代训练中,调用 Trainer API 进行梯度置零,前向传播计算损失并反向传播,参数更新。 @@ -105,4 +105,4 @@ InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制 Trainer 初始化 ------------------------- -.. autofunction:: internlm.initialize.initialize_trainer \ No newline at end of file +.. autofunction:: internlm.initialize.initialize_trainer diff --git a/doc/code-docs/source/install.md b/doc/code-docs/source/install.md index cbe6fbe..912270c 100644 --- a/doc/code-docs/source/install.md +++ b/doc/code-docs/source/install.md @@ -1,2 +1,2 @@ ```{include} ../../install.md -``` \ No newline at end of file +``` diff --git a/doc/code-docs/source/parallel.rst b/doc/code-docs/source/parallel.rst index 5f593c0..6de9545 100644 --- a/doc/code-docs/source/parallel.rst +++ b/doc/code-docs/source/parallel.rst @@ -133,7 +133,7 @@ ZeRO1.5 的实现使用了分层分片的概念,通过配置值 ``parallel.zer hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, + overlap_sync_grad=True, overlap_sync_param=True, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, diff --git a/doc/code-docs/source/qa.rst b/doc/code-docs/source/qa.rst index 3912bb3..b32859f 100644 --- a/doc/code-docs/source/qa.rst +++ b/doc/code-docs/source/qa.rst @@ -1,2 +1,2 @@ 问&答 -===== \ No newline at end of file +===== diff --git a/doc/code-docs/source/training.rst b/doc/code-docs/source/training.rst index 279d795..19bf80c 100644 --- a/doc/code-docs/source/training.rst +++ b/doc/code-docs/source/training.rst @@ -6,4 +6,4 @@ InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` 管理。在定义 有关详细用法,请参阅 Trainer API 文档和示例。 .. autoclass:: internlm.core.trainer.Trainer - :members: \ No newline at end of file + :members: diff --git a/doc/code-docs/source/usage.md b/doc/code-docs/source/usage.md index 056a7b3..7632959 100644 --- a/doc/code-docs/source/usage.md +++ b/doc/code-docs/source/usage.md @@ -1,4 +1,4 @@ ```{include} ../../usage.md :relative-docs: docs/ :relative-images: -``` \ No newline at end of file +``` diff --git a/doc/en/install.md b/doc/en/install.md index c216293..8885037 100644 --- a/doc/en/install.md +++ b/doc/en/install.md @@ -35,8 +35,8 @@ It is recommended to build a Python-3.10 virtual environment using conda and ins conda create --name internlm-env python=3.10 -y conda activate internlm-env cd internlm -pip install -r requirements/torch.txt -pip install -r requirements/runtime.txt +pip install -r requirements/torch.txt +pip install -r requirements/runtime.txt ``` Install flash-attention (version v1.0.5): @@ -65,7 +65,7 @@ Users can use the provided dockerfile combined with docker.Makefile to build the The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM: ``` bash make -f docker.Makefile BASE_OS=centos7 -``` +``` In docker.Makefile, you can customize the basic image, environment version, etc., and the corresponding parameters can be passed directly through the command line. For BASE_OS, ubuntu20.04 and centos7 are respectively supported. 
#### Pull Standard Image diff --git a/doc/en/train_performance.md b/doc/en/train_performance.md index 9c77d9e..823ecce 100644 --- a/doc/en/train_performance.md +++ b/doc/en/train_performance.md @@ -58,12 +58,12 @@ When `Activation Ckpt` is enabled,the test results are shown in the table belo | TP | Zero1 | Pack Sample Into One | Activation Ckpt | GPU Num | Seq Len | Micro Bsz | Micro Num | Global Bsz | TGS | TFLOPS | |-|-|-|-|-|-|-|-|-|-|-| | 1 | 8 | TRUE | TRUE | 8 | 2048 | 8 | 1 | 0.125M | 3314 | 193 | -| 1 | 8 | TRUE | TRUE | 16 | 2048 | 8 | 1 | 0.25M | 3268 | 191 | +| 1 | 8 | TRUE | TRUE | 16 | 2048 | 8 | 1 | 0.25M | 3268 | 191 | | 1 | 8 | TRUE | TRUE | 32 | 2048 | 8 | 1 | 0.5M | 3323 | 188 | | 1 | 8 | TRUE | TRUE | 64 | 2048 | 8 | 1 | 1M | 3217 | 188 | | 1 | 8 | TRUE | TRUE | 128 | 2048 | 8 | 1 | 2M | 3260 | 187 | | 1 | 8 | TRUE | TRUE | 256 | 2048 | 8 | 1 | 4M | 3215 | 187 | -| 1 | 8 | TRUE | TRUE | 512 | 2048 | 8 | 1 | 8M | 3199 | 186 | +| 1 | 8 | TRUE | TRUE | 512 | 2048 | 8 | 1 | 8M | 3199 | 186 | | 1 | 8 | TRUE | TRUE | 1024 | 2048 | 8 | 1 | 16M | 3163 | 184 | | 1 | 8 | TRUE | TRUE | 512 | 2048 | 4 | 1 | 4M | 2963 | 173 | | 1 | 8 | TRUE | TRUE | 1024 | 2048 | 2 | 1 | 4M | 2341 | 136 | @@ -81,7 +81,7 @@ When `Activation Ckpt` is turned off, the test results are as shown in the table | 1 | 8 | TRUE | FALSE | 256 | 2048 | 2 | 4 | 4M | 3920 | 173 | | 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 4 | 8M | 3900 | 173 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 4 | 16M | 3625 | 160 | -| 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 2 | 4M | 3084 | 139 | +| 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 2 | 4M | 3084 | 139 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 1 | 4M | 2346 | 105 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 2 | 8M | 2817 | 124 | @@ -90,4 +90,3 @@ When `Activation Ckpt` is turned off, the test results are as shown in the table
- diff --git a/doc/install.md b/doc/install.md index 8994129..63d392b 100644 --- a/doc/install.md +++ b/doc/install.md @@ -35,8 +35,8 @@ git clone git@github.com:InternLM/InternLM.git --recurse-submodules conda create --name internlm-env python=3.10 -y conda activate internlm-env cd internlm -pip install -r requirements/torch.txt -pip install -r requirements/runtime.txt +pip install -r requirements/torch.txt +pip install -r requirements/runtime.txt ``` 安装 flash-attention (version v1.0.5): @@ -65,7 +65,7 @@ cd ../../ dockerfile 的配置以及构造均通过 docker.Makefile 文件实现,在 InternLM 根目录下执行如下命令即可 build 镜像: ``` bash make -f docker.Makefile BASE_OS=centos7 -``` +``` 在 docker.Makefile 中可自定义基础镜像,环境版本等内容,对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 ubuntu20.04 和 centos7。 #### 镜像拉取 diff --git a/doc/train_performance.md b/doc/train_performance.md index 239e20f..64c768e 100644 --- a/doc/train_performance.md +++ b/doc/train_performance.md @@ -57,12 +57,12 @@ InternLM中`zero1`的配置决定了优化器状态的分配范围。 | TP | Zero1 | Pack Sample Into One | Activation Ckpt | GPU Num | Seq Len | Micro Bsz | Micro Num | Global Bsz | TGS | TFLOPS | |-|-|-|-|-|-|-|-|-|-|-| | 1 | 8 | TRUE | TRUE | 8 | 2048 | 8 | 1 | 0.125M | 3314 | 193 | -| 1 | 8 | TRUE | TRUE | 16 | 2048 | 8 | 1 | 0.25M | 3268 | 191 | +| 1 | 8 | TRUE | TRUE | 16 | 2048 | 8 | 1 | 0.25M | 3268 | 191 | | 1 | 8 | TRUE | TRUE | 32 | 2048 | 8 | 1 | 0.5M | 3323 | 188 | | 1 | 8 | TRUE | TRUE | 64 | 2048 | 8 | 1 | 1M | 3217 | 188 | | 1 | 8 | TRUE | TRUE | 128 | 2048 | 8 | 1 | 2M | 3260 | 187 | | 1 | 8 | TRUE | TRUE | 256 | 2048 | 8 | 1 | 4M | 3215 | 187 | -| 1 | 8 | TRUE | TRUE | 512 | 2048 | 8 | 1 | 8M | 3199 | 186 | +| 1 | 8 | TRUE | TRUE | 512 | 2048 | 8 | 1 | 8M | 3199 | 186 | | 1 | 8 | TRUE | TRUE | 1024 | 2048 | 8 | 1 | 16M | 3163 | 184 | | 1 | 8 | TRUE | TRUE | 512 | 2048 | 4 | 1 | 4M | 2963 | 173 | | 1 | 8 | TRUE | TRUE | 1024 | 2048 | 2 | 1 | 4M | 2341 | 136 | @@ -80,11 +80,10 @@ InternLM中`zero1`的配置决定了优化器状态的分配范围。 | 1 | 8 | TRUE | FALSE | 256 | 2048 | 2 | 4 | 4M | 3920 | 173 | | 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 4 | 8M | 3900 | 173 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 4 | 16M | 3625 | 160 | -| 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 2 | 4M | 3084 | 139 | +| 1 | 8 | TRUE | FALSE | 512 | 2048 | 2 | 2 | 4M | 3084 | 139 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 1 | 4M | 2346 | 105 | | 1 | 8 | TRUE | FALSE | 1024 | 2048 | 2 | 2 | 8M | 2817 | 124 |
- diff --git a/docker.Makefile b/docker.Makefile index 21ce55a..7cfd55a 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -104,4 +104,4 @@ devel-image: .PHONY: clean clean: - -docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME)) \ No newline at end of file + -docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME)) diff --git a/docker/Dockerfile-centos b/docker/Dockerfile-centos index 917d28f..eed33c8 100644 --- a/docker/Dockerfile-centos +++ b/docker/Dockerfile-centos @@ -128,4 +128,4 @@ RUN git submodule update --init --recursive \ && cd ./third_party/apex \ && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && /opt/conda/bin/pip cache purge \ - && rm -rf ~/.cache/pip \ No newline at end of file + && rm -rf ~/.cache/pip diff --git a/docker/Dockerfile-ubuntu b/docker/Dockerfile-ubuntu index e73421a..a7c5526 100644 --- a/docker/Dockerfile-ubuntu +++ b/docker/Dockerfile-ubuntu @@ -109,4 +109,4 @@ RUN git submodule update --init --recursive \ && cd ./third_party/apex \ && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && /opt/conda/bin/pip cache purge \ - && rm -rf ~/.cache/pip \ No newline at end of file + && rm -rf ~/.cache/pip diff --git a/experiment/Dockerfile-centos b/experiment/Dockerfile-centos index 31ffc19..a1b1424 100644 --- a/experiment/Dockerfile-centos +++ b/experiment/Dockerfile-centos @@ -158,4 +158,4 @@ RUN git submodule update --init --recursive \ && cd ./third_party/apex \ && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && /opt/conda/bin/pip cache purge \ - && rm -rf ~/.cache/pip \ No newline at end of file + && rm -rf ~/.cache/pip diff --git a/experiment/Dockerfile-ubuntu b/experiment/Dockerfile-ubuntu index 230a3b5..ed78d50 100644 --- a/experiment/Dockerfile-ubuntu +++ b/experiment/Dockerfile-ubuntu @@ -139,4 +139,4 @@ RUN git submodule update --init --recursive \ && cd ./third_party/apex \ && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && /opt/conda/bin/pip cache purge \ - && rm -rf ~/.cache/pip \ No newline at end of file + && rm -rf ~/.cache/pip diff --git a/experiment/README-CN.md b/experiment/README-CN.md index 1f96cc7..7fee559 100644 --- a/experiment/README-CN.md +++ b/experiment/README-CN.md @@ -22,4 +22,4 @@ docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 ```bash docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash ``` -容器内默认目录即 `/InternLM`,根据[使用文档](../doc/usage.md)即可启动训练。 \ No newline at end of file +容器内默认目录即 `/InternLM`,根据[使用文档](../doc/usage.md)即可启动训练。 diff --git a/experiment/README-EN.md b/experiment/README-EN.md index f9bae2b..f68efc8 100644 --- a/experiment/README-EN.md +++ b/experiment/README-EN.md @@ -22,4 +22,4 @@ For the local standard image built with dockerfile or pulled, use the following ```bash docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash ``` -The default directory in the container is `/InternLM`, please start training 
according to the [Usage](../doc/en/usage.md). \ No newline at end of file +The default directory in the container is `/InternLM`, please start training according to the [Usage](../doc/en/usage.md). diff --git a/requirements/runtime.txt b/requirements/runtime.txt index f46d7ad..e60ee2f 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -13,4 +13,4 @@ boto3 botocore torch-scatter pyecharts --f https://data.pyg.org/whl/torch-1.13.1+cu117.html \ No newline at end of file +-f https://data.pyg.org/whl/torch-1.13.1+cu117.html diff --git a/requirements/torch.txt b/requirements/torch.txt index 3b428b0..4b1efcb 100644 --- a/requirements/torch.txt +++ b/requirements/torch.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cu117 torch==1.13.1+cu117 torchvision==0.14.1+cu117 -torchaudio==0.13.1 \ No newline at end of file +torchaudio==0.13.1 diff --git a/tools/pal_inference.py b/tools/pal_inference.py index a3c0cc2..648ec58 100644 --- a/tools/pal_inference.py +++ b/tools/pal_inference.py @@ -1,3 +1,5 @@ +# flake8: noqa + # This file is modified from: # hhttps://github.com/reasoning-machines/pal/blob/main/pal/core/interface.py # @@ -27,8 +29,8 @@ import tqdm from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from tools.transformers.interface import GenerationConfig, generate_interactive from internlm.utils.timeout import Timeout +from tools.transformers.interface import GenerationConfig, generate_interactive def parse_args(): diff --git a/tools/transformers/README-zh-Hans.md b/tools/transformers/README-zh-Hans.md index 8bbdaf5..34f12fe 100644 --- a/tools/transformers/README-zh-Hans.md +++ b/tools/transformers/README-zh-Hans.md @@ -1,7 +1,7 @@ # InternLM Transformers [English](./README.md) | -[简体中文](./README-zh-Hans.md) +[简体中文](./README-zh-Hans.md) 该文件夹下包含了 transformers 格式的 `InternLM` 模型。 diff --git a/tools/transformers/README.md b/tools/transformers/README.md index 4fe2a92..6b453f3 100644 --- a/tools/transformers/README.md +++ b/tools/transformers/README.md @@ -1,7 +1,7 @@ # InternLM Transformers [English](./README.md) | -[简体中文](./README-zh-Hans.md) +[简体中文](./README-zh-Hans.md) This folder contains the `InternLM` model in transformers format. diff --git a/tools/transformers/configuration_internlm.py b/tools/transformers/configuration_internlm.py index 298f913..ebeb27d 100644 --- a/tools/transformers/configuration_internlm.py +++ b/tools/transformers/configuration_internlm.py @@ -19,9 +19,8 @@ # limitations under the License. """ InternLM model configuration""" -from transformers.utils import logging from transformers.configuration_utils import PretrainedConfig - +from transformers.utils import logging logger = logging.get_logger(__name__) @@ -30,9 +29,9 @@ INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} class InternLMConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate an InternLM - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the InternLM-7B. + This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate an + InternLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM-7B. 
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. diff --git a/tools/transformers/convert2hf.py b/tools/transformers/convert2hf.py index f8604df..594ab88 100644 --- a/tools/transformers/convert2hf.py +++ b/tools/transformers/convert2hf.py @@ -1,6 +1,6 @@ import argparse -import math import json +import math import os import re import tempfile @@ -110,7 +110,7 @@ def merge_pp(states_tp_pp): states = states_tp_pp[tp][pp] keys = list(states.keys()) for key in keys: - match = re.search("\.\d+\.", key) + match = re.search("\.\d+\.", key) # noqa: W605 if match is not None: s, e = match.span() layer_idx = int(key[s + 1 : e - 1]) + layer_shift @@ -126,9 +126,9 @@ def merge_pp(states_tp_pp): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--src_folder', type=str, default='~/test/') # 需要转换为hf格式的checkpoint文件夹 - parser.add_argument('--tgt_folder', type=str, default='~/output/') # 存放转换后checkpoint的目标文件夹 - parser.add_argument('--tokenizer', type=str, default='~/test/tokenizer.model') # Tokenizer 文件的路径 + parser.add_argument("--src_folder", type=str, default="~/test/") # 需要转换为hf格式的checkpoint文件夹 + parser.add_argument("--tgt_folder", type=str, default="~/output/") # 存放转换后checkpoint的目标文件夹 + parser.add_argument("--tokenizer", type=str, default="~/test/tokenizer.model") # Tokenizer 文件的路径 args = parser.parse_args() def load(fp): diff --git a/tools/transformers/interface.py b/tools/transformers/interface.py index 1a8a69f..50fff85 100644 --- a/tools/transformers/interface.py +++ b/tools/transformers/interface.py @@ -5,7 +5,6 @@ from typing import Callable, List, Optional import torch from torch import nn -from transformers import AutoModel, AutoTokenizer from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList from transformers.utils import logging @@ -23,7 +22,7 @@ class GenerationConfig: @torch.inference_mode() def generate_interactive( - model, + model, tokenizer, prompt, generation_config: Optional[GenerationConfig] = None, @@ -38,12 +37,12 @@ def generate_interactive( for k, v in inputs.items(): inputs[k] = v.cuda() input_ids = inputs["input_ids"] - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] + batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] # noqa: F841 if generation_config is None: generation_config = model.generation_config generation_config = copy.deepcopy(generation_config) model_kwargs = generation_config.update(**kwargs) - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id + bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id # noqa: F841 if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] if additional_eos_token_id is not None: @@ -119,11 +118,9 @@ def generate_interactive( # update generated ids, model inputs, and length for next step input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = model._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=False - ) + model_kwargs = model._update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False) unfinished_sequences = unfinished_sequences.mul((min(next_tokens != i for i in eos_token_id)).long()) - + output_token_ids = input_ids[0].cpu().tolist() output_token_ids = output_token_ids[input_length:] for each_eos_token_id in eos_token_id: diff --git 
a/tools/transformers/intern_moss_example.py b/tools/transformers/intern_moss_example.py index d8cf675..303efac 100644 --- a/tools/transformers/intern_moss_example.py +++ b/tools/transformers/intern_moss_example.py @@ -1,11 +1,13 @@ import torch +from moss_002_sft import collate_fn, get_dataset +from peft import LoraConfig, TaskType, get_peft_model from torch.utils.data import DataLoader -from peft import get_peft_model, LoraConfig, TaskType -from transformers import get_linear_schedule_with_warmup -from transformers import AutoModelForCausalLM, AutoTokenizer from tqdm import tqdm - -from moss_002_sft import get_dataset, collate_fn +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + get_linear_schedule_with_warmup, +) model_path = "model_path" data_dir = "moss_002_sft" @@ -16,8 +18,11 @@ epochs = 5 val_per_steps = 1000 lr = 9e-6 peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, r=32, lora_alpha=32, lora_dropout=0.1, - target_modules=["gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", "o_proj"] + task_type=TaskType.CAUSAL_LM, + r=32, + lora_alpha=32, + lora_dropout=0.1, + target_modules=["gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", "o_proj"], ) @@ -29,12 +34,12 @@ model.cuda() # dataset train_dataset, val_dataset = get_dataset(tokenizer, data_dir, num=data_num, test_size=test_size) -train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer)) +train_dataloader = DataLoader( + train_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer) +) optimizer = torch.optim.AdamW(model.parameters(), lr) -scheduler = get_linear_schedule_with_warmup( - optimizer, 1000, epochs * len(train_dataloader) -) +scheduler = get_linear_schedule_with_warmup(optimizer, 1000, epochs * len(train_dataloader)) # train fp = open("output", "w") @@ -42,7 +47,7 @@ model.train() for epoch in tqdm(range(epochs), desc="Traning Epoch"): batch_bar = tqdm(train_dataloader, desc="Training Batch") for step, batch in enumerate(batch_bar): - batch = {k:v.cuda() for k, v in batch.items()} + batch = {k: v.cuda() for k, v in batch.items()} with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): output = model(**batch) @@ -58,7 +63,15 @@ for epoch in tqdm(range(epochs), desc="Traning Epoch"): data, label = val_dataset[i] prefix = tokenizer.decode(data.tolist(), skip_special_tokens=True) try: - generate = model.generate(input_ids=data.unsqueeze(0).cuda(), temperature=0.7, top_k=50, do_sample=True, repetition_penalty=1.02, max_new_tokens=100, top_p=0.9) + generate = model.generate( + input_ids=data.unsqueeze(0).cuda(), + temperature=0.7, + top_k=50, + do_sample=True, + repetition_penalty=1.02, + max_new_tokens=100, + top_p=0.9, + ) text = tokenizer.decode(generate[0].tolist(), skip_special_tokens=True) text = text.replace(prefix, "") fp.write(f"Prefix: {prefix}\nGenerated: {text}" + "\n---------------------------------\n") diff --git a/tools/transformers/internlm_sft_on_moss.py b/tools/transformers/internlm_sft_on_moss.py index 1638cf1..ef88523 100644 --- a/tools/transformers/internlm_sft_on_moss.py +++ b/tools/transformers/internlm_sft_on_moss.py @@ -1,9 +1,11 @@ -import os import copy +import os import torch +from datasets import Dataset as HFDataset +from datasets import load_dataset from torch.utils.data import Dataset -from datasets import load_dataset, Dataset as HFDataset + class SFTDataset(Dataset): # 
https://github.com/OpenLMLab/MOSS/blob/main/finetune_moss.py @@ -13,7 +15,7 @@ class SFTDataset(Dataset): def __len__(self): return len(self.dataset) - + def __getitem__(self, index): data = copy.deepcopy(self.dataset[index]["input_ids"]) no_loss_spans = copy.deepcopy(self.dataset[index]["no_loss_spans"]) @@ -25,22 +27,26 @@ class SFTDataset(Dataset): label[no_loss_span[0] : no_loss_span[1]] = -100 return data, label - + + def collate_fn(batch, tokenizer): batch_input_ids, batch_labels = [], [] for input_ids, label in batch: batch_input_ids.append(input_ids) batch_labels.append(label) - batch_input_ids = torch.nn.utils.rnn.pad_sequence(batch_input_ids, batch_first=True, padding_value=tokenizer.eos_token_id) + batch_input_ids = torch.nn.utils.rnn.pad_sequence( + batch_input_ids, batch_first=True, padding_value=tokenizer.eos_token_id + ) batch_labels = torch.nn.utils.rnn.pad_sequence(batch_labels, batch_first=True, padding_value=-100) return { "input_ids": batch_input_ids, "attention_mask": (batch_input_ids == tokenizer.eos_token_id).long(), - "labels": batch_labels + "labels": batch_labels, } + def process(sample, tokenizer, max_len): chat = sample["plain_text"].split("")[:-1] num_turns = sample["num_turns"] @@ -61,7 +67,7 @@ def process(sample, tokenizer, max_len): # Add to cur_turn_ids cur_turn_ids.extend(tokenizer.encode(chat[i] + "")) # if key == 'Tool Responses': - # # The format tokens (<|Results|>:...\n) should have losses. + # # The format tokens (<|Results|>:...\n) should have losses. # cur_no_loss_spans.append((len(input_ids + cur_turn_ids) + 5, len(input_ids + cur_turn_ids + cur_ids) - 2)) if len(input_ids + cur_turn_ids) > max_len: # Too long, break @@ -81,20 +87,20 @@ def load_data(save_dir, tokenizer, max_len, num=-1) -> HFDataset: if os.path.exists(save_dir): print(f"Loading moss-002-sft from {save_dir}") else: - print(f"Loading moss-002-sft from datasets") + print("Loading moss-002-sft from datasets") moss_sft = load_dataset("fnlp/moss-002-sft-data", split="train") - moss_sft = moss_sft.map(lambda x:process(x, tokenizer, max_len), num_proc=10) - moss_sft = moss_sft.filter(lambda x:len(x["input_ids"]) != 0) + moss_sft = moss_sft.map(lambda x: process(x, tokenizer, max_len), num_proc=10) + moss_sft = moss_sft.filter(lambda x: len(x["input_ids"]) != 0) moss_sft.save_to_disk(save_dir) moss_sft = HFDataset.load_from_disk(save_dir) if num != -1: moss_sft = moss_sft.select(range(num)) - print( - f"Load successfully, total {len(moss_sft)} samples.") - + print(f"Load successfully, total {len(moss_sft)} samples.") + return moss_sft + def get_dataset(tokenizer, save_dir, max_len=1024, num=-1, test_size=0.1): moss_sft_data = load_data(save_dir, tokenizer, max_len, num) moss_sft_split = moss_sft_data.train_test_split(test_size=test_size) @@ -102,4 +108,3 @@ def get_dataset(tokenizer, save_dir, max_len=1024, num=-1, test_size=0.1): val_dataset = SFTDataset(moss_sft_split["test"]) return train_dataset, val_dataset - diff --git a/tools/transformers/modeling_internlm.py b/tools/transformers/modeling_internlm.py index da7aaa0..37f50d1 100644 --- a/tools/transformers/modeling_internlm.py +++ b/tools/transformers/modeling_internlm.py @@ -19,26 +19,35 @@ # limitations under the License. 
""" PyTorch InternLM model.""" import math +import queue +import threading from typing import List, Optional, Tuple, Union -import threading, queue import torch import torch.utils.checkpoint +from configuration_internlm import InternLMConfig from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel from transformers.generation.streamers import BaseStreamer -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from configuration_internlm import InternLMConfig - +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "InternLMConfig" + # Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 @@ -423,7 +432,7 @@ INTERNLM_INPUTS_DOCSTRING = r""" more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" +""" # noqa: E501 @add_start_docstrings( @@ -437,6 +446,7 @@ class InternLMModel(InternLMPreTrainedModel): Args: config: InternLMConfig """ + _auto_class = "AutoModel" def __init__(self, config: InternLMConfig): @@ -765,7 +775,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel): for layer_past in past_key_values: reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) return reordered_past - + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []): prompt = "" for record in history: @@ -774,43 +784,49 @@ class InternLMForCausalLM(InternLMPreTrainedModel): prompt += "" prompt += f"""<|User|>:{query}\n<|Bot|>:""" return tokenizer([prompt], return_tensors="pt") - + @torch.no_grad() - def chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): inputs = self.build_inputs(tokenizer, query, history) inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} - outputs = self.generate(**inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs) - outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]):] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] response = tokenizer.decode(outputs, skip_special_tokens=True) response = response.split("")[0] 
history = history + [(query, response)] return response, history - + @torch.no_grad() - def stream_chat(self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs): + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): """ Return a generator in format: (response, history) Eg. @@ -856,12 +872,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel): tokenizer=tokenizer, query=query, streamer=ChatStreamer(tokenizer=tokenizer), - history=history, + history=history, max_new_tokens=max_new_tokens, do_sample=do_sample, temperature=temperature, top_p=top_p, - **kwargs + **kwargs, ) def consumer(): diff --git a/tools/transformers/tokenization_internlm.py b/tools/transformers/tokenization_internlm.py index 5ce1e66..2e1b114 100644 --- a/tools/transformers/tokenization_internlm.py +++ b/tools/transformers/tokenization_internlm.py @@ -24,11 +24,9 @@ from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm - from transformers.tokenization_utils import PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"} @@ -239,4 +237,4 @@ class InternLMTokenizer(PreTrainedTokenizer): if token_ids_1 is None: return len(token_ids_0 + eos) * [0] - return len(token_ids_0 + eos + token_ids_1 + eos) * [0] \ No newline at end of file + return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
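As context for the reformatted `chat()` API in `tools/transformers/modeling_internlm.py` above, here is a minimal usage sketch. It is illustrative only and not part of the patch: the checkpoint path is a hypothetical placeholder for a directory produced by `tools/transformers/convert2hf.py` (its `--tgt_folder` argument), and a CUDA device is assumed.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical output directory of the conversion script shown in the patch, e.g.:
#   python tools/transformers/convert2hf.py --src_folder <internlm_ckpt> \
#       --tgt_folder ./hf_ckpt --tokenizer <tokenizer.model>
model_path = "./hf_ckpt"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.cuda().eval()

# chat() builds the <|User|>/<|Bot|> prompt internally and returns
# (response, updated_history), matching the signature reformatted above.
history = []
response, history = model.chat(
    tokenizer,
    "Hello! Please introduce yourself.",
    history=history,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.8,
    top_p=0.8,
)
print(response)
```

For incremental output, `stream_chat()` accepts the same generation parameters and, per its docstring in the same file, returns a generator of `(response, history)` pairs.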