diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index 5e44dc9..4729380 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -150,7 +150,7 @@ class ParallelContext(metaclass=SingletonMeta):
         self.virtual_pipeline_parallel_size = None
         self.virtual_pipeline_parallel_rank = None
         self._expert_parallel_group_names = []
-        self.layer_names = {"unknown", "embedding", "norm", "head"}
+        self.layer_names = ["unknown"]
 
     @property
     def config(self):
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 2856a78..b772469 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -24,6 +24,7 @@ from internlm.solver.pipeline_utils import partition_uniform
 from internlm.utils.checkpoint import activation_checkpoint
 from internlm.utils.common import filter_kwargs
 from internlm.utils.logger import get_logger
+from internlm.utils.parallel import set_model_params_layer_name
 from internlm.utils.registry import MODEL_INITIALIZER
 
 MODEL_TYPE = "INTERNLM"
@@ -418,6 +419,21 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
     if gpc.is_rank_for_log():
         logger.info(f"The layer sharding is {all_parts}.")
 
+    # config gpc.layer_names
+    # get names of first and last layers
+    kwargs["num_layers"] = 1
+    kwargs["device"] = device
+    kwargs["first"] = True
+    kwargs["last"] = True
+    kwargs["start_layer_idx"] = 0
+    tmp_chunk = PackedFlashInternLm1D(**filter_kwargs(PackedFlashInternLm1D.__init__, kwargs)).cpu()
+    # get names of middle layers
+    for idx in range(num_layers):
+        layer_name = f"{PackedFlashBaseLayer1D.__name__}.{idx}"
+        gpc.layer_names.append(layer_name)
+    set_model_params_layer_name(tmp_chunk)
+    torch.cuda.empty_cache()
+
     models = []
 
     for start, end in parts:
diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py
index 43489bc..1c6df29 100644
--- a/internlm/model/modeling_moe.py
+++ b/internlm/model/modeling_moe.py
@@ -26,6 +26,7 @@ from internlm.solver.pipeline_utils import partition_uniform
 from internlm.utils.checkpoint import activation_checkpoint
 from internlm.utils.common import filter_kwargs
 from internlm.utils.logger import get_logger
+from internlm.utils.parallel import set_model_params_layer_name
 from internlm.utils.registry import MODEL_INITIALIZER
 
 MODEL_TYPE = "INTERNLM_MoE"
@@ -516,6 +517,21 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
     if gpc.is_rank_for_log():
         logger.info(f"The layer sharding is {all_parts}.")
 
+    # config gpc.layer_names
+    # get names of first and last layers
+    kwargs["num_layers"] = 1
+    kwargs["device"] = device
+    kwargs["first"] = True
+    kwargs["last"] = True
+    kwargs["start_layer_idx"] = 0
+    tmp_chunk = PackedFlashInternLm1D(**filter_kwargs(PackedFlashInternLm1D.__init__, kwargs)).cpu()
+    # get names of middle layers
+    for idx in range(num_layers):
+        layer_name = f"{PackedFlashBaseLayer1D.__name__}.{idx}"
+        gpc.layer_names.append(layer_name)
+    set_model_params_layer_name(tmp_chunk)
+    torch.cuda.empty_cache()
+
     models = []
 
     for start, end in parts:
diff --git a/internlm/utils/parallel.py b/internlm/utils/parallel.py
index 001af22..2da62e8 100644
--- a/internlm/utils/parallel.py
+++ b/internlm/utils/parallel.py
@@ -78,14 +78,16 @@ def set_model_params_layer_name(model):
         if isinstance(_chunk, NaiveAMPModel):
             _chunk = _chunk.model
         # Create a unique layer name based on the block's class name and index
-        for name, children in _chunk.named_children():
+        for _, children in _chunk.named_children():
             if isinstance(children, nn.ModuleList):
                 for idx, block in enumerate(children):
                     for param in block.parameters():
                         layer_name = f"{block.__class__.__name__}.{idx}"
-                        gpc.layer_names.add(layer_name)
+                        gpc.layer_names.append(layer_name)
                         param.__setattr__("layer_name", layer_name)
             else:
                 for param in children.parameters():
-                    gpc.layer_names.add(name)
-                    param.__setattr__("layer_name", name)
+                    layer_name = f"{children.__class__.__name__}"
+                    gpc.layer_names.append(layer_name)
+                    param.__setattr__("layer_name", layer_name)
+    gpc.layer_names = sorted(set(gpc.layer_names))
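
Note (not part of the patch): as a rough, standalone illustration of the renamed tagging logic in internlm/utils/parallel.py, the sketch below uses a toy nn.Module and a plain module-level list as stand-ins for PackedFlashInternLm1D and gpc.layer_names.

# Minimal standalone sketch; ToyModel and layer_names are stand-ins for
# PackedFlashInternLm1D and gpc.layer_names, not InternLM code.
import torch.nn as nn

layer_names = ["unknown"]  # mirrors the new list-based gpc.layer_names


class ToyModel(nn.Module):
    def __init__(self, num_blocks=2):
        super().__init__()
        self.embedding = nn.Embedding(16, 8)
        self.blocks = nn.ModuleList([nn.Linear(8, 8) for _ in range(num_blocks)])
        self.norm = nn.LayerNorm(8)


def tag_params_with_layer_name(model):
    # As in set_model_params_layer_name: children inside an nn.ModuleList are
    # named "<ClassName>.<idx>", other children are named just "<ClassName>".
    for _, children in model.named_children():
        if isinstance(children, nn.ModuleList):
            for idx, block in enumerate(children):
                layer_name = f"{block.__class__.__name__}.{idx}"
                for param in block.parameters():
                    layer_names.append(layer_name)
                    setattr(param, "layer_name", layer_name)
        else:
            layer_name = f"{children.__class__.__name__}"
            for param in children.parameters():
                layer_names.append(layer_name)
                setattr(param, "layer_name", layer_name)
    # de-duplicate and order, matching the final added line of the patch
    return sorted(set(layer_names))


if __name__ == "__main__":
    model = ToyModel()
    print(tag_params_with_layer_name(model))
    # ['Embedding', 'LayerNorm', 'Linear.0', 'Linear.1', 'unknown']
    print(model.norm.weight.layer_name)  # LayerNorm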