From f5aea7e08cb8a376d53b6da6884634b60245aa69 Mon Sep 17 00:00:00 2001
From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com>
Date: Tue, 21 Nov 2023 19:19:22 +0800
Subject: [PATCH 1/2] fix(timeout): larger timeout (#495)

* larger initialize timeout

* unify time format

* update timeout thresholds
---
 internlm/utils/common.py  |  2 +-
 internlm/utils/timeout.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/internlm/utils/common.py b/internlm/utils/common.py
index 6d7f7b2..6c9cc68 100644
--- a/internlm/utils/common.py
+++ b/internlm/utils/common.py
@@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs):
 def launch_time():
     global CURRENT_TIME
     if not CURRENT_TIME:
-        CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S")
+        CURRENT_TIME = datetime.now().strftime("%m-%d-%H:%M:%S")
     return CURRENT_TIME
 
 
diff --git a/internlm/utils/timeout.py b/internlm/utils/timeout.py
index 4e68ce9..711c6da 100644
--- a/internlm/utils/timeout.py
+++ b/internlm/utils/timeout.py
@@ -39,14 +39,14 @@ ENABLE_TIMEOUT = os.getenv("INTERNLM_ENABLE_TIMEOUT", None)
 
 
 timeout_threshold_dict = {
-    "initialize_distributed_env": 120,
+    "initialize_distributed_env": 240,
     "nopp_forward_backward_step": 360,
-    "initialize_model": 10,
-    "initialize_optimizer": 20,
-    "optim_step": 30,
+    "initialize_model": 60,
+    "initialize_optimizer": 60,
+    "optim_step": 60,
     "get_train_data_loader": 600,
-    "get_validation_data_loader": 60,
-    "load_new_batch": 10,
+    "get_validation_data_loader": 120,
+    "load_new_batch": 20,
     "record_current_batch_training_metrics": 10,
     "save_checkpoint": 1200,
     "interleaved_forward_backward_step": 600,

From 77766933730388101ede10797f5699698c04897a Mon Sep 17 00:00:00 2001
From: jiaxingli <43110891+li126com@users.noreply.github.com>
Date: Tue, 21 Nov 2023 19:20:02 +0800
Subject: [PATCH 2/2] feat(doc): add GPU memory info for 7B & 20B models (#507)

* unitest_only_forward

* memory_test

* doc fix

* doc fix
---
 doc/en/train_performance.md | 64 +++++++++++++++++++++++++++++++++++++
 doc/train_performance.md    | 63 ++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/doc/en/train_performance.md b/doc/en/train_performance.md
index 9c77d9e..91b87de 100644
--- a/doc/en/train_performance.md
+++ b/doc/en/train_performance.md
@@ -91,3 +91,67 @@ When `Activation Ckpt` is turned off, the test results are as shown in the table
     <img src="../imgs/flops.png" width="580"/>
 </div>
 
+
+### GPU Memory Usage Test
+Test configuration:
+| Configuration | Description |
+| :-------: | :-----: |
+| branch    | develop |
+| tag       | v0.2.1dev20231121 |
+| GPU       | A800    |
+| Checkpoint| True    |
+| micro_bsz | 1       |
+| micro_num | 4       |
+| dtype     | bfloat16|
+
+```python
+# InternLM/configs/7B_sft.py
+data = dict(
+    # micro_num is the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    ...
+)
+
+model = dict(
+    checkpoint=True,
+    dtype="torch.bfloat16",
+    ...
+)
+
+parallel = dict(
+    zero1=dict(size=8, fsdp=False),
+    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
+)
+```
+
+Pre-training & Fine-tuning test:
+|model|Number of GPUs|zero1|tp|pp|fsdp|GPU Memory (GB)|
+|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
+| 7B | 3 | -1 | 1 | 3 |False| 75 |
+| 7B | 3 | -1 | 1 | 1 |True | 72 |
+| 7B | 4 | -1 | 4 | 1 |True | 52 |
+| 7B | 4 | -1 | 4 | 1 |False| 61 |
+| 7B | 4 | -1 | 1 | 4 |False| 69 |
+| 7B | 4 | -1 | 1 | 1 |True | 56 |
+| 7B | 5 | -1 | 1 | 1 |True | 49 |
+| 7B | 5 | -1 | 1 | 5 |False| 62 |
+| 7B | 6 | -1 | 1 | 1 |True | 39 |
+| 7B | 6 | -1 | 2 | 1 |True | 38 |
+| 7B | 6 | -1 | 1 | 6 |False| 56 |
+| 20B | 8 | -1 | 1 | 1 |True | 78 |
+| 20B | 8 | -1 | 8 | 1 |True | 71 |
+| 20B | 16 | -1 | 1 | 1 |True | 40 |
+| 20B | 16 | -1 | 8 | 1 |True | 39 |
+| 20B | 16 | -1 | 1 | 16 |False| 52 |
+
+
+Web_demo test:
+
+|model|GPU|GPU Memory (GB)|System Memory (MB)|
+|:-:|:-:|:-:|:-:|
+| 7B | A800 | 14.5 | 2465 |
+| 20B | A800 | 39 | 9547 |
diff --git a/doc/train_performance.md b/doc/train_performance.md
index 239e20f..c5e2a7c 100644
--- a/doc/train_performance.md
+++ b/doc/train_performance.md
@@ -88,3 +88,66 @@ InternLM中`zero1`的配置决定了优化器状态的分配范围。
     <img src="../doc/imgs/flops.png" width="580"/>
 </div>
 
+### 显存占用测试
+测试配置：
+| 配置      | 描述    |
+| :-------: | :-----: |
+| 分支      | develop |
+| tag       | v0.2.1dev20231121 |
+| 显卡      | A800    |
+| 重计算    | True    |
+| micro_bsz | 1       |
+| micro_num | 4       |
+| dtype     | bfloat16|
+
+```python
+# InternLM/configs/7B_sft.py
+data = dict(
+    # micro_num is the number of micro-batches contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=1,
+    ...
+)
+
+model = dict(
+    checkpoint=True,
+    dtype="torch.bfloat16",
+    ...
+)
+
+parallel = dict(
+    zero1=dict(size=8, fsdp=False),
+    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
+)
+```
+
+预训练和微调测试：
+|模型|卡数|zero1|tp|pp|fsdp|显存（GB）|
+|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
+| 7B | 3 | -1 | 1 | 3 |False| 75 |
+| 7B | 3 | -1 | 1 | 1 |True | 72 |
+| 7B | 4 | -1 | 4 | 1 |True | 52 |
+| 7B | 4 | -1 | 4 | 1 |False| 61 |
+| 7B | 4 | -1 | 1 | 4 |False| 69 |
+| 7B | 4 | -1 | 1 | 1 |True | 56 |
+| 7B | 5 | -1 | 1 | 1 |True | 49 |
+| 7B | 5 | -1 | 1 | 5 |False| 62 |
+| 7B | 6 | -1 | 1 | 1 |True | 39 |
+| 7B | 6 | -1 | 2 | 1 |True | 38 |
+| 7B | 6 | -1 | 1 | 6 |False| 56 |
+| 20B | 8 | -1 | 1 | 1 |True | 78 |
+| 20B | 8 | -1 | 8 | 1 |True | 71 |
+| 20B | 16 | -1 | 1 | 1 |True | 40 |
+| 20B | 16 | -1 | 8 | 1 |True | 39 |
+| 20B | 16 | -1 | 1 | 16 |False| 52 |
+
+
+Web_demo 测试：
+
+|模型|显卡|显存（GB）|内存（MB）|
+|:-:|:-:|:-:|:-:|
+| 7B | A800 | 14.5 | 2465 |
+| 20B | A800 | 39 | 9547 |
\ No newline at end of file