mirror of https://github.com/InternLM/InternLM

set layer name to parameters after init_model

parent 646f1b45fa
commit 7d68509c4f
@@ -150,6 +150,7 @@ class ParallelContext(metaclass=SingletonMeta):
         self.virtual_pipeline_parallel_size = None
         self.virtual_pipeline_parallel_rank = None
         self._expert_parallel_group_names = []
+        self.layer_names = {"unknown", "embedding", "norm", "head"}

     @property
     def config(self):
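The set added above seeds gpc.layer_names with four default buckets; per-block names are registered later by set_model_params_layer_name (see the last hunk). A minimal sketch of the intended use, with "Block.0" standing in as a hypothetical per-block entry:

    # Sketch only: the default buckets plus one hypothetical per-block name.
    layer_names = {"unknown", "embedding", "norm", "head"}
    layer_names.add("Block.0")  # i.e. "<block class name>.<block index>"
    per_layer_norm = {name: 0.0 for name in layer_names}  # accumulator keyed by layer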
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-import copy
 import math
 from abc import ABC, abstractmethod
 from collections import OrderedDict
@@ -32,7 +31,6 @@ except (ModuleNotFoundError, ImportError):
     APEX_AVAILABLE = False

 inf = math.inf
-global_layer_norms = {"unknown": 0.0, "embedding": 0.0, "norm": 0.0, "head": 0.0}


 def flatten(input_):
@@ -228,7 +226,7 @@ def compute_norm(
     enable_cuda_kernels = gradients[0].device.type == "cuda"
     # Norm parameters.
     norm_type = float(norm_type)
-    total_layer_norms = copy.deepcopy(global_layer_norms)
+    total_layer_norms = {layer_name: 0.0 for layer_name in gpc.layer_names}
     layer_grads = {}
     # Calculate norm.
     if norm_type == inf:
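With the module-level global_layer_norms gone (previous hunks), compute_norm now builds its per-layer accumulator from whatever names were registered on the parallel context. A standalone sketch of the bucketing idea, assuming only that parameters carry the layer_name attribute this commit attaches; the helper and the toy usage are illustrative, not InternLM code:

    import torch

    def per_layer_grad_norms(parameters, layer_names, norm_type=2.0):
        # Accumulate gradient norms per layer bucket, mirroring total_layer_norms.
        totals = {name: 0.0 for name in layer_names}
        for p in parameters:
            if p.grad is None:
                continue
            name = getattr(p, "layer_name", "unknown")  # default bucket as fallback
            totals[name] += p.grad.norm(norm_type).item() ** norm_type
        return {k: v ** (1.0 / norm_type) for k, v in totals.items()}

    # Toy usage: tag a single Linear layer and inspect its bucketed norm.
    layer = torch.nn.Linear(4, 4)
    for p in layer.parameters():
        p.layer_name = "head"  # hypothetical tag
    layer(torch.randn(2, 4)).sum().backward()
    print(per_layer_grad_norms(layer.parameters(), {"unknown", "head"}))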
@@ -249,7 +247,7 @@ def compute_norm(
             total_layer_norms[key] = max(value, total_layer_norms[key])

         total_layer_norms_values = move_norm_to_cuda(torch.Tensor(list(total_layer_norms.values())))
-        total_layer_norms_keys = list(global_layer_norms.keys())
+        total_layer_norms_keys = list(total_layer_norms.keys())

         # Take max across all model-parallel GPUs.
         if gpc.is_initialized(ParallelMode.MODEL):
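For context, the surrounding code takes the per-layer values' max across the model-parallel group. A minimal sketch of such a MAX reduction, assuming an already-initialized torch.distributed process group; the function name is illustrative, not InternLM's:

    import torch
    import torch.distributed as dist

    def reduce_layer_maxima(layer_norms, group=None):
        # Keep keys and values in one fixed order so all ranks reduce matching slots.
        keys = list(layer_norms.keys())
        values = torch.tensor([layer_norms[k] for k in keys])
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(values, op=dist.ReduceOp.MAX, group=group)
        return dict(zip(keys, values.tolist()))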
@@ -523,24 +521,17 @@ class ParamBcastSyncHandler:
         for _chunk in model:
             if isinstance(_chunk, NaiveAMPModel):
                 _chunk = _chunk.model
-            for name, children in _chunk.named_children():
+            for _, children in _chunk.named_children():
                 # should be the transformer block definaton in modeling_xxx.py
                 if isinstance(children, nn.ModuleList):
                     # record the block that a parameter belongs to
-                    for idx, block in enumerate(children):
+                    for _, block in enumerate(children):
                         # self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
                         self._block_to_param[block] = list(block.parameters())
-                        for parameter in self._block_to_param[block]:
-                            layer_name = f"{block.__class__.__name__}.{idx}"
-                            global_layer_norms[layer_name] = 0.0
-                            parameter.__setattr__("layer_name", layer_name)
                 else:
                     # record the block that a parameter belongs to
                     # self._block_to_param[name] = list(children.parameters())
                     self._block_to_param[children] = list(children.parameters())
-                    for parameter in self._block_to_param[children]:
-                        layer_name = f"{children.__class__.__name__}"
-                        parameter.__setattr__("layer_name", name)

         alloc_num = 0
         rank_to_go = 0
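After this hunk, ParamBcastSyncHandler only keeps the block-to-parameter bookkeeping; tagging parameters with layer names moves to set_model_params_layer_name (last hunk). A toy sketch of the remaining grouping pattern, with a made-up model class for illustration:

    import torch.nn as nn

    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = nn.Embedding(10, 8)
            self.blocks = nn.ModuleList(nn.Linear(8, 8) for _ in range(2))

    block_to_param = {}
    for _, children in ToyModel().named_children():
        if isinstance(children, nn.ModuleList):
            # treat the ModuleList as the stack of transformer blocks
            for block in children:
                block_to_param[block] = list(block.parameters())
        else:
            block_to_param[children] = list(children.parameters())

    for block, params in block_to_param.items():
        print(type(block).__name__, len(params))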
@@ -52,7 +52,11 @@ from internlm.train.utils import create_param_groups
 from internlm.utils.common import DummyProfile
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
-from internlm.utils.parallel import sync_model_param, sync_model_param_within_tp
+from internlm.utils.parallel import (
+    set_model_params_layer_name,
+    sync_model_param,
+    sync_model_param_within_tp,
+)
 from internlm.utils.registry import MODEL_INITIALIZER
 from internlm.utils.timeout import llm_timeout

@@ -107,6 +111,9 @@ def initialize_model():
     # if fsdp enabled, wrap the model
     model = wrap_FSDP_model(model)

+    # set the layer name as an attribute of the model parameters
+    set_model_params_layer_name(model)
+
     return model


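A hypothetical post-init sanity check (not part of the diff), assuming a single unwrapped model chunk: after initialize_model() has called set_model_params_layer_name(model), every parameter is expected to expose a layer_name attribute.

    def check_layer_names(model):
        # Collect any parameters that were not tagged during initialize_model().
        missing = [n for n, p in model.named_parameters() if not hasattr(p, "layer_name")]
        assert not missing, f"parameters without layer_name: {missing}"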
@@ -2,6 +2,7 @@
 # -*- encoding: utf-8 -*-

 import torch.distributed as dist
+from torch import nn

 from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
 from internlm.core.context import global_context as gpc
@@ -61,3 +62,28 @@ def get_parallel_log_file_name():
         f"tp={gpc.get_local_rank(ParallelMode.TENSOR)}_pp={gpc.get_local_rank(ParallelMode.PIPELINE)}"
     )
     return log_file_name
+
+
+def set_model_params_layer_name(model):
+    r"""Set the layer name as an attribute of the model parameters.
+
+    Args:
+        model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
+    """
+    if isinstance(model, nn.ModuleList):
+        _chunk = model[0]
+    else:
+        _chunk = model
+
+    # Create a unique layer name based on the block's class name and index
+    for name, children in _chunk.named_children():
+        if isinstance(children, nn.ModuleList):
+            for idx, block in enumerate(children):
+                for param in block.parameters():
+                    layer_name = f"{block.__class__.__name__}.{idx}"
+                    gpc.layer_names.add(layer_name)
+                    param.__setattr__("layer_name", layer_name)
+        else:
+            for param in children.parameters():
+                gpc.layer_names.add(name)
+                param.__setattr__("layer_name", name)
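To illustrate the naming scheme the new helper implements, here is a self-contained sketch without the gpc singleton (the toy model and function name are made up); it hoists the name computation out of the per-parameter loop but produces the same tags:

    import torch.nn as nn

    def tag_layer_names(model, registry):
        # Parameters inside an nn.ModuleList get "<BlockClass>.<index>";
        # every other child's parameters are tagged with the child's attribute name.
        for name, children in model.named_children():
            if isinstance(children, nn.ModuleList):
                for idx, block in enumerate(children):
                    layer_name = f"{block.__class__.__name__}.{idx}"
                    registry.add(layer_name)
                    for param in block.parameters():
                        param.layer_name = layer_name
            else:
                registry.add(name)
                for param in children.parameters():
                    param.layer_name = name

    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.embedding = nn.Embedding(10, 8)
            self.blocks = nn.ModuleList(nn.Linear(8, 8) for _ in range(2))
            self.head = nn.Linear(8, 10)

    names = {"unknown", "embedding", "norm", "head"}  # default buckets from this commit
    model = ToyModel()
    tag_layer_names(model, names)
    print(sorted(names))  # now also contains "Linear.0" and "Linear.1"
    print({n: p.layer_name for n, p in model.named_parameters()})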