From e32fbaaae2a817a82bc19c57c4a291dd73c31253 Mon Sep 17 00:00:00 2001 From: Wenwen Qu Date: Thu, 24 Aug 2023 16:40:11 +0800 Subject: [PATCH] Update 7B_sft.py --- configs/7B_sft.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/configs/7B_sft.py b/configs/7B_sft.py index d062040..eba77c8 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -128,6 +128,18 @@ model = dict( num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. num_experts=8, ) +""" +zero1 parallel: + 1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + 2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters. + 3. if zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler. +tensor parallel: tensor parallel size, usually the number of GPUs per node. +""" parallel = dict( zero1=8, pipeline=dict(size=1, interleaved_overlap=True),