From cc5b15349da0b01f5d1307c3284ce6a0d9ca17e7 Mon Sep 17 00:00:00 2001
From: Pryest <54388244+Pryest@users.noreply.github.com>
Date: Mon, 11 Dec 2023 19:36:31 +0800
Subject: [PATCH] fix(metric): add metric dtype control (#533)

* fix(metric): add metric dtype control

* fix demo config to avoid implicitness

* fix default behavior
---
 configs/7B_MoE4_sft.py    | 30 +++++++++++++++++-------------
 configs/7B_sft.py         | 28 ++++++++++++++++------------
 internlm/model/metrics.py | 11 ++++++++---
 3 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index 92a93d0..cc94cdc 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -145,18 +145,18 @@ model = dict(
     moe_use_residual=False,
     moe_gate_k=2,
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=1,
@@ -176,4 +176,8 @@ monitor = dict(
     ),
 )
 
-model_type = "INTERNLM_MoE"
\ No newline at end of file
+model_type = "INTERNLM_MoE"
+
+# metric_dtype can be "fp32" or any other string.
+# Metrics are computed in fp32 only when it is set to "fp32".
+# metric_dtype = "fp32"
diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 7d945b4..c0a9bc8 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -146,18 +146,18 @@ model = dict(
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=8, fsdp=False),
     tensor=1,
@@ -177,3 +177,7 @@ monitor = dict(
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
 )
+
+# metric_dtype can be "fp32" or any other string.
+# Metrics are computed in fp32 only when it is set to "fp32".
+# metric_dtype = "fp32"
diff --git a/internlm/model/metrics.py b/internlm/model/metrics.py
index 1f54d06..704d2d6 100644
--- a/internlm/model/metrics.py
+++ b/internlm/model/metrics.py
@@ -26,7 +26,11 @@ class AccPerplex:
         self.device = device
         self.right = torch.Tensor([0]).to(device=device)
         self.total = torch.Tensor([0]).to(device=device)
-        self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=torch.float)
+        self.metric_dtype = torch.float if gpc.config.get("metric_dtype", None) == "fp32" else None
+        if self.metric_dtype is not None:
+            self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=self.metric_dtype)
+        else:
+            self.total_log_probs = torch.Tensor([0]).to(device=device)
         self.tp_pg = tp_pg
         self.dp_pg = dp_pg
         self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)
@@ -128,8 +132,9 @@ class AccPerplex:
 
         # All reduce is needed to get the chunks from other GPUs.
         torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)
-        predicted_logits = predicted_logits.to(dtype=torch.float)
-        shift_logits = shift_logits.to(dtype=torch.float)
+        if self.metric_dtype is not None:
+            predicted_logits = predicted_logits.to(dtype=self.metric_dtype)
+            shift_logits = shift_logits.to(dtype=self.metric_dtype)
 
         pred_exp_logits = torch.exp(predicted_logits)
         # Sum of exponential of logits along vocab dimension across all GPUs.
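
Usage note: below is a minimal, self-contained sketch of the dtype switch this patch introduces. The plain config dict stands in for InternLM's gpc.config and the tensors are dummies; only the dtype-selection logic mirrors the patched AccPerplex code.

import torch

# Stand-in for gpc.config; in a real run this would come from the training config file.
# Use {} (or any string other than "fp32") to keep the logits in the model dtype.
config = {"metric_dtype": "fp32"}

# Same rule as AccPerplex.__init__: only the literal string "fp32" enables fp32 metrics.
metric_dtype = torch.float if config.get("metric_dtype", None) == "fp32" else None

shift_logits = torch.randn(4, 8, dtype=torch.bfloat16)  # dummy logits in the training dtype
predicted_logits = shift_logits.max(dim=-1).values

if metric_dtype is not None:
    # Cast before exp/accumulation so the perplexity statistics are not rounded in bf16.
    predicted_logits = predicted_logits.to(dtype=metric_dtype)
    shift_logits = shift_logits.to(dtype=metric_dtype)

pred_exp_logits = torch.exp(predicted_logits)
print(pred_exp_logits.dtype)  # torch.float32 here; torch.bfloat16 if metric_dtype is unset

Uncommenting metric_dtype = "fp32" in a training config restores the pre-patch behavior of always computing metrics in fp32; leaving it unset keeps the logits in the model's compute dtype, which is the new default.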