feat(model/overlap_handler.py): fix lint error

pull/456/head
huangting4201 2023-10-23 15:22:03 +08:00
parent f6a5086fe4
commit 0d693cf3a1
4 changed files with 26 additions and 19 deletions

View File

@@ -53,7 +53,6 @@ class MoE(torch.nn.Module):
         device=None,
         dtype=None,
     ):
         super().__init__()
         assert (

View File

@@ -10,7 +10,10 @@ from internlm.core.context import global_context as gpc
 from internlm.core.naive_amp import NaiveAMPModel
 from internlm.model.embedding import Embedding1D
 from internlm.model.linear import FSTPLinear, ScaleColumnParallelLinear
-from internlm.model.utils import all_gather_raw_memory_pool, all_gather_raw_bias_memory_pool
+from internlm.model.utils import (
+    all_gather_raw_bias_memory_pool,
+    all_gather_raw_memory_pool,
+)
 from internlm.utils.common import get_current_device
@@ -25,7 +28,7 @@ class FSTPOverlapHandler:
         self.fstp_modules = []
         self.module_name = ["Wqkv", "out_proj", "w1", "w2", "w3"]
         self.fstp_global_handle = dict()  # key: fstp module; value: module global all-gather op handle
         self.bias_global_handle = dict()  # key: fstp module; value: module bias global all-gather op handle
         self.module_to_index = dict()  # key: fstp module; value: transformer block index
         self.index_to_fstp_modules = dict()  # key: transformer block index; value: fsdp modules
         self.head = []
@@ -107,6 +110,10 @@ class FSTPOverlapHandler:
                 weight[name] = torch.zeros(self.module_shape[name], dtype=dtype, device=device).contiguous()
             self.all_gather_memory_pool.append(weight)  # containing two groups of block weight

+    def clear_memory_pool(self) -> None:
+        self.zero_const_pool = {}
+        self.reduce_scatter_memory_pool = {}
+
     def get_all_gather_memory(self, module):
         block_index = self.module_to_index[module]
         return self.all_gather_memory_pool[block_index % 2][module._fstp_name]
@@ -119,20 +126,21 @@
             for _ in range(2):
                 weight = {}
                 weight[module._fstp_name] = torch.zeros(
                     self.module_shape[module._fstp_name][0],
                     dtype=gpc.config.model.get("dtype", torch.half),
-                    device=get_current_device()).contiguous()
+                    device=get_current_device(),
+                ).contiguous()
                 self.all_gather_bias_memory_pool.append(weight)
         elif module._fstp_name not in self.all_gather_bias_memory_pool[0]:
             for i in range(2):
                 self.all_gather_bias_memory_pool[i][module._fstp_name] = torch.zeros(
                     self.module_shape[module._fstp_name][0],
                     dtype=gpc.config.model.get("dtype", torch.half),
-                    device=get_current_device()).contiguous()
+                    device=get_current_device(),
+                ).contiguous()
         return self.all_gather_bias_memory_pool[block_index % 2][module._fstp_name]

     def get_reduce_scatter_memory(self, key):
         return_idx = 0
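Note: as a rough illustration of the pattern this file implements, the sketch below shows a stripped-down double-buffered all-gather pool together with the clear_memory_pool() helper added in this commit. The class name SimpleOverlapPool, the register_module helper, and keying buffers by module name instead of module object are illustrative simplifications, not part of the actual FSTPOverlapHandler API.

# Minimal sketch, assuming a two-slot (ping-pong) pool indexed by block_index % 2.
import torch


class SimpleOverlapPool:
    def __init__(self, device=None, dtype=torch.half):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.dtype = dtype
        # Two slots so block i can reuse the buffer of block i - 2 while the
        # gathered weights of block i - 1 are still being consumed.
        self.all_gather_memory_pool = [{}, {}]
        self.zero_const_pool = {}
        self.reduce_scatter_memory_pool = {}
        self.module_to_index = {}
        self.module_shape = {}

    def register_module(self, name, shape, block_index):
        # Remember which transformer block this module belongs to and make sure
        # both pool slots hold a persistent, contiguous buffer for it.
        self.module_to_index[name] = block_index
        self.module_shape[name] = shape
        for slot in self.all_gather_memory_pool:
            if name not in slot:
                slot[name] = torch.zeros(shape, dtype=self.dtype, device=self.device).contiguous()

    def get_all_gather_memory(self, name):
        # Alternate between the two slots based on the owning block's index.
        block_index = self.module_to_index[name]
        return self.all_gather_memory_pool[block_index % 2][name]

    def clear_memory_pool(self) -> None:
        # Same idea as the helper added in the diff: drop the per-step caches
        # so they are rebuilt lazily on the next training iteration.
        self.zero_const_pool = {}
        self.reduce_scatter_memory_pool = {}


# Usage (illustrative):
# pool = SimpleOverlapPool(dtype=torch.float32)
# pool.register_module("Wqkv", (4096, 1024), block_index=3)
# buf = pool.get_all_gather_memory("Wqkv")   # slot 3 % 2 == 1
# pool.clear_memory_pool()                   # once per training step

The two-slot pool is what keeps memory usage flat while still letting all-gather communication for the next block overlap with the computation of the current one, without allocating fresh tensors every iteration.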

View File

@@ -140,6 +140,7 @@ def all_gather_raw_memory_pool(
     )
     return handle

 def all_gather_raw_bias_memory_pool(
     input_: Tensor,
     process_group: ProcessGroup,
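For context, the sketch below shows one way an all-gather-into-pool helper in the spirit of all_gather_raw_bias_memory_pool can be structured: gather directly into a preallocated buffer and hand back the async work handle. Only input_ and process_group appear in the diff above; the output_buffer and async_op parameters and the use of torch.distributed.all_gather_into_tensor are assumptions for illustration, not the function's actual signature.

# Hedged sketch: async all-gather into a preallocated pool buffer.
# output_buffer must already be sized to hold world_size * input_.numel()
# elements (e.g. a tensor handed out by the handler's memory pool).
import torch
import torch.distributed as dist


def all_gather_into_pool_sketch(input_: torch.Tensor, output_buffer: torch.Tensor, process_group, async_op: bool = True):
    # Writing into output_buffer avoids allocating a fresh gathered tensor
    # every step; with async_op=True the collective returns a work handle the
    # caller can wait() on right before the gathered parameter is consumed,
    # which is what lets the communication overlap with computation.
    handle = dist.all_gather_into_tensor(output_buffer, input_.contiguous(), group=process_group, async_op=async_op)
    return output_buffer, handle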

View File

@@ -298,8 +298,7 @@ def main(args):
            prof.step()
        if gpc.fstp_handler is not None:
-            gpc.fstp_handler.zero_const_pool = {}
-            gpc.fstp_handler.reduce_scatter_memory_pool = {}
+            gpc.fstp_handler.clear_memory_pool()
        # torch.cuda.memory._dump_snapshot(f"my_snapshot_{gpc.get_global_rank()}.pickle")
        torch.cuda.reset_peak_memory_stats()
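Finally, a simplified view of where the consolidated cleanup call sits in a training loop like the one patched here; the loop body is stubbed out and run_one_step, num_steps, and the prof argument are placeholders, so this is a sketch of the call placement rather than the actual train.py.

import torch


def train_loop_sketch(run_one_step, num_steps, fstp_handler=None, prof=None):
    # run_one_step is a stand-in for the real forward/backward/optimizer step.
    for _ in range(num_steps):
        run_one_step()
        if prof is not None:
            prof.step()
        if fstp_handler is not None:
            # One call now replaces the two inline dict resets
            # (zero_const_pool and reduce_scatter_memory_pool).
            fstp_handler.clear_memory_pool()
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()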