From cc5b15349da0b01f5d1307c3284ce6a0d9ca17e7 Mon Sep 17 00:00:00 2001
From: Pryest <54388244+Pryest@users.noreply.github.com>
Date: Mon, 11 Dec 2023 19:36:31 +0800
Subject: [PATCH] fix(metric): add metric dtype control (#533)

* fix(metric): add metric dtype control

* fix demo config to avoid implicitness

* fix default behavior
---
 configs/7B_MoE4_sft.py    | 30 +++++++++++++++++-------------
 configs/7B_sft.py         | 28 ++++++++++++++++------------
 internlm/model/metrics.py | 11 ++++++++---
 3 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index 92a93d0..cc94cdc 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -145,18 +145,18 @@ model = dict(
     moe_use_residual=False,
     moe_gate_k=2,
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=1,
@@ -176,4 +176,8 @@ monitor = dict(
     ),
 )
 
-model_type = "INTERNLM_MoE"
\ No newline at end of file
+model_type = "INTERNLM_MoE"
+
+# metric_dtype can be "fp32" or any other string.
+# Metrics are computed in fp32 only when it is set to "fp32".
+# metric_dtype = "fp32"
diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 7d945b4..c0a9bc8 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -146,18 +146,18 @@ model = dict(
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=8, fsdp=False),
     tensor=1,
@@ -177,3 +177,7 @@ monitor = dict(
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
 )
+
+# metric_dtype can be "fp32" or any other string.
+# Metrics are computed in fp32 only when it is set to "fp32".
+# metric_dtype = "fp32"
diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py
index 1f54d06..704d2d6 100644
--- a/internlm/model/metrics.py
+++ b/internlm/model/metrics.py
@@ -26,7 +26,11 @@ class AccPerplex:
         self.device = device
         self.right = torch.Tensor([0]).to(device=device)
         self.total = torch.Tensor([0]).to(device=device)
-        self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=torch.float)
+        self.metric_dtype = torch.float if gpc.config.get("metric_dtype", None) == "fp32" else None
+        if self.metric_dtype is not None:
+            self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=self.metric_dtype)
+        else:
+            self.total_log_probs = torch.Tensor([0]).to(device=device)
         self.tp_pg = tp_pg
         self.dp_pg = dp_pg
         self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)
@@ -128,8 +132,9 @@ class AccPerplex:
 
         # All reduce is needed to get the chunks from other GPUs.
         torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
-        predicted_logits = predicted_logits.to(dtype=torch.float)
-        shift_logits = shift_logits.to(dtype=torch.float)
+        if self.metric_dtype is not None:
+            predicted_logits = predicted_logits.to(dtype=self.metric_dtype)
+            shift_logits = shift_logits.to(dtype=self.metric_dtype)
 
         pred_exp_logits = torch.exp(predicted_logits)
         # Sum of exponential of logits along vocab dimension across all GPUs.
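
Usage note: below is a minimal, self-contained sketch of the dtype switch this patch introduces. The plain config dict stands in for InternLM's gpc.config and the tensors are dummies; only the dtype-selection logic mirrors the patched AccPerplex code.

import torch

# Stand-in for gpc.config; in a real run this would come from the training config file.
# Use {} (or any string other than "fp32") to keep the logits in the model dtype.
config = {"metric_dtype": "fp32"}

# Same rule as AccPerplex.__init__: only the literal string "fp32" enables fp32 metrics.
metric_dtype = torch.float if config.get("metric_dtype", None) == "fp32" else None

shift_logits = torch.randn(4, 8, dtype=torch.bfloat16)  # dummy logits in the training dtype
predicted_logits = shift_logits.max(dim=-1).values

if metric_dtype is not None:
    # Cast before exp/accumulation so the perplexity statistics are not rounded in bf16.
    predicted_logits = predicted_logits.to(dtype=metric_dtype)
    shift_logits = shift_logits.to(dtype=metric_dtype)

pred_exp_logits = torch.exp(predicted_logits)
print(pred_exp_logits.dtype)  # torch.float32 here; torch.bfloat16 if metric_dtype is unset

Uncommenting metric_dtype = "fp32" in a training config restores the pre-patch behavior of always computing metrics in fp32; leaving it unset keeps the logits in the model's compute dtype, which is the new default.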