From f5aea7e08cb8a376d53b6da6884634b60245aa69 Mon Sep 17 00:00:00 2001 From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:19:22 +0800 Subject: [PATCH 1/2] fix(timeout): larger timeout (#495) * larger initialize timeout * unify time format * update timeout thresholds --- internlm/utils/common.py | 2 +- internlm/utils/timeout.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 6d7f7b2..6c9cc68 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs): def launch_time(): global CURRENT_TIME if not CURRENT_TIME: - CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S") + CURRENT_TIME = datetime.now().strftime("%m-%d-%H:%M:%S") return CURRENT_TIME diff --git a/internlm/utils/timeout.py b/internlm/utils/timeout.py index 4e68ce9..711c6da 100644 --- a/internlm/utils/timeout.py +++ b/internlm/utils/timeout.py @@ -39,14 +39,14 @@ ENABLE_TIMEOUT = os.getenv("INTERNLM_ENABLE_TIMEOUT", None) timeout_threshold_dict = { - "initialize_distributed_env": 120, + "initialize_distributed_env": 240, "nopp_forward_backward_step": 360, - "initialize_model": 10, - "initialize_optimizer": 20, - "optim_step": 30, + "initialize_model": 60, + "initialize_optimizer": 60, + "optim_step": 60, "get_train_data_loader": 600, - "get_validation_data_loader": 60, - "load_new_batch": 10, + "get_validation_data_loader": 120, + "load_new_batch": 20, "record_current_batch_training_metrics": 10, "save_checkpoint": 1200, "interleaved_forward_backward_step": 600, From 77766933730388101ede10797f5699698c04897a Mon Sep 17 00:00:00 2001 From: jiaxingli <43110891+li126com@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:20:02 +0800 Subject: [PATCH 2/2] feat(doc): add GPU memory info for 7B & 20B models (#507) * unitest_only_forward * memory_test * doc fix * doc fix --- doc/en/train_performance.md | 64 
+++++++++++++++++++++++++++++++++++++ doc/train_performance.md | 63 ++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/doc/en/train_performance.md b/doc/en/train_performance.md index 9c77d9e..91b87de 100644 --- a/doc/en/train_performance.md +++ b/doc/en/train_performance.md @@ -91,3 +91,67 @@ When `Activation Ckpt` is turned off, the test results are as shown in the table + +### GPU Memory Usage Test +Test configuration: +| Configuration | Description | +| :-------: | :-----: | +| branch | develop | +| tag | v0.2.1dev20231121 | +| GPU | A800 | +| Checkpoint| True | +| micro_bsz | 1 | +| micro_num | 4 | +| dtype | bfloat16| + +```python +# InternLM/configs/7B_sft.py +data = dict( + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + ... +) + +model = dict( + checkpoint=True, + dtype="torch.bfloat16", + ... +) + +parallel = dict( + zero1=dict(size=8, fsdp=False), + tensor=1, + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=False, +) +``` + +Pre-training & Fine-tuning test: +|model|Number of GPUs|zero1|tp|pp|fsdp|GPU Memory (GB)| +|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| 7B | 3 | -1 | 1 | 3 |False| 75 | +| 7B | 3 | -1 | 1 | 1 |True | 72 | +| 7B | 4 | -1 | 4 | 1 |True | 52 | +| 7B | 4 | -1 | 4 | 1 |False| 61 | +| 7B | 4 | -1 | 1 | 4 |False| 69 | +| 7B | 4 | -1 | 1 | 1 |True | 56 | +| 7B | 5 | -1 | 1 | 1 |True | 49 | +| 7B | 5 | -1 | 1 | 5 |False| 62 | +| 7B | 6 | -1 | 1 | 1 |True | 39 | +| 7B | 6 | -1 | 2 | 1 |True | 38 | +| 7B | 6 | -1 | 1 | 6 |False| 56 | +| 20B | 8 | -1 | 1 | 1 |True | 78 | +| 20B | 8 | -1 | 8 | 1 |True | 71 | +| 20B | 16 | -1 | 1 | 1 |True | 40 | +| 20B | 16 | -1 | 8 | 1 |True | 39 | +| 20B | 16 | -1 | 1 | 16 |False| 52 | + + +Web_demo test: + +|model|GPU|GPU Memory (GB)|System Memory (MB)| +|:-:|:-:|:-:|:-:| +| 7B | A800 | 14.5 | 2465 | +| 20B | A800 | 39 | 9547 | diff --git a/doc/train_performance.md 
b/doc/train_performance.md index 239e20f..c5e2a7c 100644 --- a/doc/train_performance.md +++ b/doc/train_performance.md @@ -88,3 +88,66 @@ InternLM中`zero1`的配置决定了优化器状态的分配范围。 +### 显存占用测试 +测试配置: +| 配置 | 描述 | +| :-------: | :-----: | +| 分支 | develop | +| tag | v0.2.1dev20231121 | +| 显卡 | A800 | +| 重计算 | True | +| micro_bsz | 1 | +| micro_num | 4 | +| dtype | bfloat16| + +```python +# InternLM/configs/7B_sft.py +data = dict( + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + ... +) + +model = dict( + checkpoint=True, + dtype="torch.bfloat16", + ... +) + +parallel = dict( + zero1=dict(size=8, fsdp=False), + tensor=1, + pipeline=dict(size=1, interleaved_overlap=True), + sequence_parallel=False, +) +``` + +预训练和微调测试: +|模型|卡数|zero1|tp|pp|fsdp|显存(GB)| +|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| 7B | 3 | -1 | 1 | 3 |False| 75 | +| 7B | 3 | -1 | 1 | 1 |True | 72 | +| 7B | 4 | -1 | 4 | 1 |True | 52 | +| 7B | 4 | -1 | 4 | 1 |False| 61 | +| 7B | 4 | -1 | 1 | 4 |False| 69 | +| 7B | 4 | -1 | 1 | 1 |True | 56 | +| 7B | 5 | -1 | 1 | 1 |True | 49 | +| 7B | 5 | -1 | 1 | 5 |False| 62 | +| 7B | 6 | -1 | 1 | 1 |True | 39 | +| 7B | 6 | -1 | 2 | 1 |True | 38 | +| 7B | 6 | -1 | 1 | 6 |False| 56 | +| 20B | 8 | -1 | 1 | 1 |True | 78 | +| 20B | 8 | -1 | 8 | 1 |True | 71 | +| 20B | 16 | -1 | 1 | 1 |True | 40 | +| 20B | 16 | -1 | 8 | 1 |True | 39 | +| 20B | 16 | -1 | 1 | 16 |False| 52 | + + +Web_demo 测试: + +|模型|显卡|显存(GB)|内存(MB)| +|:-:|:-:|:-:|:-:| +| 7B | A800 | 14.5 | 2465 | +| 20B | A800 | 39 | 9547 | \ No newline at end of file