diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index a060f47..941354c 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -364,6 +364,11 @@ def launch(
             f"data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, "
             f"tensor parallel size: {gpc.tensor_parallel_size}",
         )
+        logger.info(
+            f"Creating MoE with num_experts: {gpc.config.model.num_experts} | "
+            f"expert parallel size: {gpc.expert_parallel_size} | "
+            f"number of local experts: {gpc.config.model.num_experts//gpc.expert_parallel_size}"
+        )
 
 
 def launch_from_slurm(
diff --git a/internlm/model/moe.py b/internlm/model/moe.py
index b116937..414baa0 100644
--- a/internlm/model/moe.py
+++ b/internlm/model/moe.py
@@ -69,11 +69,6 @@ class MoE(torch.nn.Module):
         self.num_experts = num_experts
         self.num_local_experts = num_experts // self.ep_size
 
-        if gpc.is_rank_for_log():
-            logger.info(  # pylint: disable=W1203
-                f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
-                f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
-            )
         assert noisy_gate_policy is None or noisy_gate_policy in ["None", "Jitter", "RSample"], (
             "Unsupported noisy_gate_policy: " + noisy_gate_policy
         )
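
Note (not part of the patch): the relocated log line derives the per-rank expert count by integer division, which assumes num_experts divides evenly by gpc.expert_parallel_size. A minimal standalone sketch of that relationship, using a hypothetical helper name and example values:

    def local_expert_count(num_experts: int, expert_parallel_size: int) -> int:
        # Mirrors the integer division used in the new log line; the explicit
        # divisibility check here is illustrative and not part of the patch.
        assert num_experts % expert_parallel_size == 0, "num_experts must divide evenly"
        return num_experts // expert_parallel_size

    print(local_expert_count(8, 4))  # -> 2 local experts per expert-parallel rank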