From 6150e4daedd5b743c95a50862fe80e8062a40167 Mon Sep 17 00:00:00 2001
From: Sun Peng
Date: Wed, 12 Jul 2023 18:59:31 +0800
Subject: [PATCH] fix/fix_submodule_err (#61)

* fix/fix_submodule_err

---------

Co-authored-by: ChenQiaoling00
---
 doc/en/install.md           | 3 ++-
 doc/en/usage.md             | 4 ++--
 doc/install.md              | 3 ++-
 doc/usage.md                | 2 +-
 third_party/apex            | 2 +-
 third_party/flash-attention | 2 +-
 6 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/doc/en/install.md b/doc/en/install.md
index 5ff70fb..2799449 100644
--- a/doc/en/install.md
+++ b/doc/en/install.md
@@ -8,7 +8,8 @@ The required packages and corresponding version are shown as follows:
 - CUDA == 11.7
 - Pytorch == 1.13.1+cu117
 - Transformers >= 4.25.1
-- Flash-Attention == 23.05
+- Flash-Attention == v1.0.5
+- Apex == 23.05
 - GPU with Ampere or Hopper architecture (such as H100, A100)
 - Linux OS
 
diff --git a/doc/en/usage.md b/doc/en/usage.md
index d6adf9f..b0fec39 100644
--- a/doc/en/usage.md
+++ b/doc/en/usage.md
@@ -192,7 +192,7 @@ $ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python trai
 
 If you want to start distributed training on torch with 8 GPUs on a single node, use the following command:
 ```bash
-$ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py
+$ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py --launcher "torch"
 ```
 
 ### Training Results
@@ -217,4 +217,4 @@ Taking the configuration of the demo training on a single machine with 8 GPUs on
 2023-07-07 12:29:09,307 INFO train.py:323 in record_current_batch_training_metrics -- tflops=188.8613541410694,step=3,loss=11.099515914916992,tgs (tokens/gpu/second)=4252.55,lr=1.0000000000000002e-06,loss_scale=65536.0,grad_norm=63.5478796484391,micro_num=4,num_consumed_tokens=524288,inf_nan_skip_batches=0,num_samples_in_batch=16,largest_length=2048,largest_batch=5,smallest_batch=3,adam_beta2=0.95,fwd_bwd_time=3.7
 2023-07-07 12:29:13,147 INFO train.py:323 in record_current_batch_training_metrics -- tflops=189.65918563194305,step=4,loss=10.149517059326172,tgs (tokens/gpu/second)=4270.52,lr=1.2000000000000002e-06,loss_scale=65536.0,grad_norm=51.582841631508145,micro_num=4,num_consumed_tokens=655360,inf_nan_skip_batches=0,num_samples_in_batch=19,largest_length=2048,largest_batch=6,smallest_batch=3,adam_beta2=0.95,fwd_bwd_time=3.68
 2023-07-07 12:29:16,994 INFO train.py:323 in record_current_batch_training_metrics -- tflops=189.3109313713174,step=5,loss=9.822169303894043,tgs (tokens/gpu/second)=4262.67,lr=1.4000000000000001e-06,loss_scale=65536.0,grad_norm=47.10386835560855,micro_num=4,num_consumed_tokens=786432,inf_nan_skip_batches=0,num_samples_in_batch=17,largest_length=2048,largest_batch=6,smallest_batch=3,adam_beta2=0.95,fwd_bwd_time=3.69
-```
\ No newline at end of file
+```
diff --git a/doc/install.md b/doc/install.md
index c0d6434..c8eceeb 100644
--- a/doc/install.md
+++ b/doc/install.md
@@ -8,7 +8,8 @@
 - CUDA == 11.7
 - Pytorch == 1.13.1+cu117
 - Transformers >= 4.25.1
-- Flash-Attention == 23.05
+- Flash-Attention == v1.0.5
+- Apex == 23.05
 - Ampere或者Hopper架构的GPU (例如H100, A100)
 - Linux OS
 
diff --git a/doc/usage.md b/doc/usage.md
index d6e85ae..ef173ac 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -175,7 +175,7 @@ $ srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python trai
 
 若在 torch 上启动分布式运行环境，单节点 8 卡的运行命令如下所示：
 ```bash
-$ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py
+$ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py --launcher "torch"
 ```
 
 ### 运行结果
diff --git a/third_party/apex b/third_party/apex
index 8ffc901..0da3ffb 160000
--- a/third_party/apex
+++ b/third_party/apex
@@ -1 +1 @@
-Subproject commit 8ffc901e50bbf740fdb6d5bccb17f66a6ec8604e
+Subproject commit 0da3ffb92ee6fbe5336602f0e3989db1cd16f880
diff --git a/third_party/flash-attention b/third_party/flash-attention
index d2f4324..eff9fe6 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit d2f4324f4c56e017fbf22dc421943793a8ca6c3b
+Subproject commit eff9fe6b8076df59d64d7a3f464696738a3c7c24