mirror of https://github.com/InternLM/InternLM

support reduce scatter memory pool

parent 4742271154
commit ed7232777a
@@ -57,7 +57,7 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=False,
-    total_steps=50,
+    total_steps=20,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
@@ -5,7 +5,7 @@ SEQ_LEN = 4096
 HIDDEN_SIZE = 6144
 NUM_ATTENTION_HEAD = 48
 MLP_RATIO = 8 / 3
-NUM_LAYER = 40
+NUM_LAYER = 60
 VOCAB_SIZE = 103168

 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
@@ -51,7 +51,7 @@ data = dict(
     # micro_num means the number of micro_batch contained in one gradient update
     micro_num=4,
     # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=4,
+    micro_bsz=2,
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
@@ -161,8 +161,8 @@ pipeline parallel (dict):
 sequence parallel (bool): enable/disable sequence parallel, defaults to False.
 """
 parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, mode="origin_tp", overlap=False),
+    zero1=dict(size=4, fsdp=False),
+    tensor=dict(size=8, mode="fstp", overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=True,
 )
@@ -162,7 +162,7 @@ sequence parallel (bool): enable/disable sequence parallel, defaults to False.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, mode="fstp"),
+    tensor=dict(size=8, mode="fstp", overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=True,
 )
@@ -14,6 +14,7 @@ from torch.distributed import ProcessGroup
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.utils.logger import get_logger
+from internlm.utils.common import get_current_device

 logger = get_logger(__file__)

@@ -148,6 +149,18 @@ def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
                                                      async_op=async_op)
     return output, handle

+def reduce_scatter_raw_memory_pool(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
+    world_size = torch.distributed.get_world_size(process_group)
+    assert input_.shape[0] % world_size == 0
+    size = (input_.shape[0] // world_size, *input_.shape[1:])
+    index = check_reduce_scatter_memory_pool(size)
+    output = gpc.config.reduce_scatter_memory[size]['data'][index]
+    setattr(output, "index", index)
+    handle = torch.distributed.reduce_scatter_tensor(output, input_.contiguous(),
+                                                     group=process_group,
+                                                     async_op=async_op)
+    return output, handle
+

 # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py
 class FusedDenseFunc(torch.autograd.Function):
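Note: reduce_scatter_raw allocates a fresh output tensor on every backward call, while the new reduce_scatter_raw_memory_pool borrows a cached buffer keyed by the output shape and stamps it with its pool slot index so the consumer can return it later. A minimal, self-contained sketch of that pooling idea follows; the module-level _POOL dict and the function names are illustrative, not the repository's gpc-backed implementation.

# Minimal, self-contained sketch of the pooling idea (illustrative names; the
# repository keys its pool off gpc.config.reduce_scatter_memory instead).
from typing import Dict, Tuple

import torch
import torch.distributed as dist

# shape -> {"data": [cached output tensors], "used": [per-buffer in-use flags]}
_POOL: Dict[Tuple[int, ...], Dict[str, list]] = {}


def _acquire(shape, dtype, device) -> int:
    """Return the index of a free cached buffer for `shape`, allocating one if needed."""
    slot = _POOL.setdefault(shape, {"data": [], "used": []})
    for idx, used in enumerate(slot["used"]):
        if not used:
            slot["used"][idx] = True
            return idx
    slot["data"].append(torch.zeros(shape, dtype=dtype, device=device))
    slot["used"].append(True)
    return len(slot["data"]) - 1


def reduce_scatter_pooled(input_: torch.Tensor, group, async_op: bool = False):
    """Reduce-scatter `input_` into a pooled buffer instead of a fresh allocation."""
    world_size = dist.get_world_size(group)
    assert input_.shape[0] % world_size == 0
    out_shape = (input_.shape[0] // world_size, *input_.shape[1:])
    idx = _acquire(out_shape, input_.dtype, input_.device)
    output = _POOL[out_shape]["data"][idx]
    setattr(output, "index", idx)  # remember the slot so the consumer can release it later
    handle = dist.reduce_scatter_tensor(output, input_.contiguous(), group=group, async_op=async_op)
    return output, handle


def release(shape: Tuple[int, ...], index: int) -> None:
    """Mark a pooled buffer as reusable once its contents have been consumed."""
    _POOL[shape]["used"][index] = False

A buffer stays marked as used until release is called, so overlapping in-flight reduce-scatters of the same shape each get their own slot and the pool only grows to the peak number of outstanding gradients per shape.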
@@ -404,12 +417,13 @@ class FSTPFusedDenseFunc(torch.autograd.Function):
            # assert hasattr(bias, "_fstp_all_reduce_str")
            # all_gather_handler.all_reduce_handlers[bias._fstp_all_reduce_str] = (handle_grad_bias, grad_bias_async)
            # grad_bias = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device)
-           grad_weight_async, handle_grad_weight = reduce_scatter_raw(grad_weight, process_group, async_op=True)
+
+           grad_weight_async, handle_grad_weight = reduce_scatter_raw_memory_pool(grad_weight, process_group, async_op=True)
            assert hasattr(weight, "_fstp_reduce_scatter_str")
            all_gather_handler.reduce_scatter_handlers[weight._fstp_reduce_scatter_str] = (handle_grad_weight, grad_weight_async)
            grad_weight = all_gather_handler.get_zero_by_shape((grad_weight.shape[0]//torch.distributed.get_world_size(process_group), *grad_weight.shape[1:]), dtype=grad_weight.dtype, device=grad_weight.device)
            if grad_bias is not None:
-               grad_bias_async, handle_grad_bias = reduce_scatter_raw(grad_bias, process_group, async_op=True)
+               grad_bias_async, handle_grad_bias = reduce_scatter_raw_memory_pool(grad_bias, process_group, async_op=True)
                assert hasattr(bias, "_fstp_reduce_scatter_str")
                all_gather_handler.reduce_scatter_handlers[bias._fstp_reduce_scatter_str] = (handle_grad_bias, grad_bias_async)
                grad_bias = all_gather_handler.get_zero_by_shape((grad_bias.shape[0]//torch.distributed.get_world_size(process_group), *grad_bias.shape[1:]), dtype=grad_bias.dtype, device=grad_bias.device)
@@ -521,3 +535,37 @@ def Silu(w1_o, w2_o):


 Silu = torch.jit.script(Silu)
+
+def check_reduce_scatter_memory_pool(key):
+
+    return_idx = 0
+
+    # if key not in dict
+    if key not in gpc.config.reduce_scatter_memory:
+        gpc.config.reduce_scatter_memory[key] = {'data': [], 'used': []}
+
+    # if the data is empty
+    if len(gpc.config.reduce_scatter_memory[key]['data']) == 0:
+        gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key,
+                                                                         dtype=gpc.config.model.get("dtype", torch.half),
+                                                                         device=get_current_device()).contiguous())
+        gpc.config.reduce_scatter_memory[key]['used'].append(True)
+        return_idx = 0
+        return return_idx
+    else:  # if not empty
+        for index, used in enumerate(gpc.config.reduce_scatter_memory[key]['used']):
+            if used == False:
+                gpc.config.reduce_scatter_memory[key]['used'][index] = True
+                return_idx = index
+                return return_idx
+        # if the memory pool is all used
+        length = len(gpc.config.reduce_scatter_memory[key]['data'])
+        gpc.config.reduce_scatter_memory[key]['data'].append(torch.zeros(key,
+                                                                         dtype=gpc.config.model.get("dtype", torch.half),
+                                                                         device=get_current_device()).contiguous())
+        gpc.config.reduce_scatter_memory[key]['used'].append(True)
+        return_idx = length
+        return return_idx
+
+def release_reduce_scatter_memory_pool(size, index):
+    gpc.config.reduce_scatter_memory[size]['used'][index] = False
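Note: check_reduce_scatter_memory_pool hands out a free slot (allocating a new zero buffer on demand) and marks it used; release_reduce_scatter_memory_pool flips the flag back so the buffer can be recycled. A sketch of the borrow/release lifecycle from a caller's point of view, assuming torch.distributed is initialized, initialize_model() has populated gpc.config.reduce_scatter_memory, and reduce_scatter_raw_memory_pool lives in internlm.model.utils alongside release_reduce_scatter_memory_pool:

# Sketch only: shows how the pooled reduce-scatter output is consumed and then
# returned to the pool via the index attribute stamped on it.
import torch

from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.model.utils import (
    reduce_scatter_raw_memory_pool,
    release_reduce_scatter_memory_pool,
)


def reduce_scatter_and_release(grad: torch.Tensor) -> torch.Tensor:
    group = gpc.get_group(ParallelMode.TENSOR)
    # Borrow a pooled buffer and launch the asynchronous reduce-scatter into it.
    output, handle = reduce_scatter_raw_memory_pool(grad, group, async_op=True)
    handle.wait()            # communication done; output now holds this rank's shard
    result = output.clone()  # copy out before handing the buffer back to the pool
    release_reduce_scatter_memory_pool(size=tuple(output.size()), index=output.index)
    return result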
@@ -10,7 +10,7 @@ from torch.optim import Optimizer

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import split_forward_gather_backward
+from internlm.model.utils import split_forward_gather_backward, release_reduce_scatter_memory_pool
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
     BucketStore,
@@ -353,7 +353,8 @@ class HybridZeroOptimizer(BaseOptimizer):
                     comm_handle.wait()
                     _param.grad.add_(_grad)
                     # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    del _grad
+                    # del _grad
+                    release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
                     del self._fstp_handler.reduce_scatter_handlers[key]
                     self._fstp_handler.reduce_scatter_handlers[key] = None
                     assert key in self._fstp_handler.reduce_scatter_handlers
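Note: in both optimizer hunks the reduced gradient is no longer deleted; once the async handle completes and the shard has been accumulated into the parameter gradient, the pooled buffer is handed back via release_reduce_scatter_memory_pool, using the index attribute stamped on it by reduce_scatter_raw_memory_pool. A condensed sketch of that pattern, with the handlers/params bookkeeping simplified and illustrative:

# Condensed, illustrative sketch of the wait -> accumulate -> release pattern;
# `handlers` maps a key to (comm_handle, pooled_grad) and `params` maps the same
# key to the parameter being accumulated (the real optimizer bookkeeping differs).
from internlm.model.utils import release_reduce_scatter_memory_pool


def accumulate_pooled_grads(handlers, params) -> None:
    for key, (comm_handle, _grad) in handlers.items():
        comm_handle.wait()              # make sure the async reduce-scatter finished
        params[key].grad.add_(_grad)    # accumulate this rank's gradient shard
        # Return the pooled buffer instead of freeing it, so later steps reuse it.
        release_reduce_scatter_memory_pool(size=tuple(_grad.size()), index=_grad.index)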
@@ -395,7 +396,8 @@ class HybridZeroOptimizer(BaseOptimizer):
                     comm_handle.wait()
                     _param.grad.add_(_grad)
                     # self._fstp_handler.reduce_scatter_handlers[key] = None
-                    del _grad
+                    # del _grad
+                    release_reduce_scatter_memory_pool(size=tuple(_grad.size()),index=_grad.index)
                     del self._fstp_handler.reduce_scatter_handlers[key]
                     self._fstp_handler.reduce_scatter_handlers[key] = None
                     assert key in self._fstp_handler.reduce_scatter_handlers
@@ -51,7 +51,7 @@ from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
 from internlm.solver.optimizer import FSDPadaptOptimizer, HybridZeroOptimizer
 from internlm.solver.optimizer.utils import ParamBcastSyncHandler
 from internlm.train.utils import create_param_groups
-from internlm.utils.common import DummyProfile
+from internlm.utils.common import DummyProfile, get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.parallel import sync_model_param, sync_model_param_within_tp
@@ -123,7 +123,8 @@ def initialize_model():
         mlp_ratio = gpc.config.MLP_RATIO
         mlp_hidden_size = int(hidden_size * mlp_ratio)
         mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256)
-        size_key = [(3 * hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (mlp_hidden_size, hidden_size), (hidden_size, hidden_size)]
+        world_size = gpc.get_world_size(ParallelMode.TENSOR)
+        size_key = [(3 * hidden_size // world_size, hidden_size), (mlp_hidden_size // world_size, hidden_size), (hidden_size // world_size, mlp_hidden_size), (hidden_size // world_size, hidden_size)]
         module_name = ['Wqkv', 'out_proj', 'w1', 'w2', 'w3']
         for i in range(2):
             weight = {}
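Note: the first dimension of each size_key entry is now divided by the tensor-parallel world size because the pooled buffers hold only this rank's reduce-scatter shard of a weight gradient. Illustrative arithmetic with the values used in this config (HIDDEN_SIZE = 6144, MLP_RATIO = 8 / 3, tensor size 8); the module attributions in the comments follow the full-weight shapes used for block_memory below and are an inference:

# Illustrative arithmetic only; values taken from this config.
hidden_size = 6144
world_size = 8                                                 # tensor-parallel size
mlp_hidden_size = int(hidden_size * 8 / 3)                     # 16384
mlp_hidden_size = 256 * ((mlp_hidden_size + 256 - 1) // 256)   # round up to a multiple of 256 -> 16384

size_key = [
    (3 * hidden_size // world_size, hidden_size),   # (2304, 6144), Wqkv gradient shard
    (mlp_hidden_size // world_size, hidden_size),   # (2048, 6144), w1 / w2 gradient shard
    (hidden_size // world_size, mlp_hidden_size),   # (768, 16384), w3 gradient shard
    (hidden_size // world_size, hidden_size),       # (768, 6144),  out_proj gradient shard
]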
@@ -131,21 +132,26 @@ def initialize_model():
                 if name == 'Wqkv':
                     weight[name] = torch.zeros((3 * hidden_size, hidden_size),
                                                dtype=gpc.config.model.get("dtype", torch.half),
-                                               device='cuda').contiguous()
+                                               device=get_current_device()).contiguous()
                 elif name == 'out_proj':
                     weight[name] = torch.zeros((hidden_size, hidden_size),
                                                dtype=gpc.config.model.get("dtype", torch.half),
-                                               device='cuda').contiguous()
+                                               device=get_current_device()).contiguous()
                 elif name == 'w1' or name == 'w2':
                     weight[name] = torch.zeros((mlp_hidden_size, hidden_size),
                                                dtype=gpc.config.model.get("dtype", torch.half),
-                                               device='cuda').contiguous()
+                                               device=get_current_device()).contiguous()
                 else:
                     weight[name] = torch.zeros((hidden_size, mlp_hidden_size),
                                                dtype=gpc.config.model.get("dtype", torch.half),
-                                               device='cuda').contiguous()
+                                               device=get_current_device()).contiguous()
             block_memory[i] = weight
+        reduce_scatter_memory = {}
+        for key in size_key:
+            reduce_scatter_memory[key] = {'data': [], 'used': []}
+
         gpc.config.block_memory = block_memory
+        gpc.config.reduce_scatter_memory = reduce_scatter_memory

     return model

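Note: initialize_model only registers an empty {'data': [], 'used': []} slot per shard shape; the actual buffers are materialized lazily by check_reduce_scatter_memory_pool on first use and the pool then grows with the number of concurrently in-flight reduce-scatters. An illustrative helper, not part of the commit, for inspecting how much memory the pool ends up holding:

# Illustrative helper (not part of the commit): report how much memory the
# reduce-scatter pool currently holds, e.g. before dumping a CUDA memory snapshot.
from internlm.core.context import global_context as gpc


def reduce_scatter_pool_stats() -> None:
    total_bytes = 0
    for shape, slot in gpc.config.reduce_scatter_memory.items():
        in_use = sum(1 for used in slot['used'] if used)
        for buf in slot['data']:
            total_bytes += buf.numel() * buf.element_size()
        print(f"shape={shape}: buffers={len(slot['data'])}, in_use={in_use}")
    print(f"total pooled memory: {total_bytes / 1024**2:.1f} MiB")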
train.py
@@ -299,6 +299,7 @@ def main(args):

         if gpc.config.fstp_handler is not None:
             gpc.config.fstp_handler.zero_const_pool = {}
+            gpc.config.fstp_handler.reduce_scatter_memory = {}
             torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
             torch.cuda.reset_peak_memory_stats()

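Note: emptying zero_const_pool and reduce_scatter_memory on the handler before the snapshot appears intended to drop references to cached buffers so the dump reflects live usage. A hedged sketch of the surrounding debugging flow; the wrapper below is illustrative, and _dump_snapshot together with its companion _record_memory_history are private PyTorch APIs whose signatures vary across releases:

# Illustrative wrapper around the per-rank memory snapshot used in this hunk.
import torch

from internlm.core.context import global_context as gpc


def snapshot_cuda_memory(tag: str = "my_snapshot") -> None:
    # Drop references to cached pool buffers first so the snapshot reflects live usage.
    if gpc.config.fstp_handler is not None:
        gpc.config.fstp_handler.zero_const_pool = {}
        gpc.config.fstp_handler.reduce_scatter_memory = {}
    # torch.cuda.memory._record_memory_history() is typically enabled once at startup
    # (private, version-dependent API).
    torch.cuda.memory._dump_snapshot(f"{tag}_{gpc.get_global_rank()}.pickle")
    torch.cuda.reset_peak_memory_stats()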