fix(train.py): fix overflow grad norm error (#230)

pull/231/head
huangting4201 2023-08-24 17:46:27 +08:00 committed by GitHub
parent 2acb278e1f
commit 29dd401071
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 2 additions and 2 deletions

View File

@@ -585,7 +585,7 @@ class HybridZeroOptimizer(BaseOptimizer):
)
self._grad_store._averaged_gradients = dict()
self.zero_grad()
return False, None
return False, norms
# copy the grad of fp16 param to fp32 param
single_grad_partition_groups = []

View File

@@ -235,7 +235,7 @@ def main(args):
train_state.step_count += 1
else:
train_state.inf_nan_skip_batches += 1 # record the amount of updating parameters unsuccessfully.
if -99.0 in grad_norm_groups and gpc.is_rank_for_log(): # -99.0 encodes a specific failure case
if -1 in grad_norm_groups and gpc.is_rank_for_log(): # -1 encodes a specific failure case
logger.warning(f"Warning: skip parameter update at step {batch_count}.")
send_alert_message(
address=gpc.config.alert_address,