mirror of https://github.com/InternLM/InternLM
refactor code
parent 80972ff314
commit 548d1bd7af
@@ -695,9 +695,11 @@ class HybridZeroOptimizer(BaseOptimizer):
         # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
         # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
-        is_tp_shared_params = (self._is_norm_group(self.optim.param_groups[group_id])
-                               or self._is_gate_group(self.optim.param_groups[group_id]))
-        if is_tp_shared_params:
+        is_tp_sync_groups = (
+            self._is_norm_group(self.optim.param_groups[group_id]),
+            self._is_gate_group(self.optim.param_groups[group_id]),
+        )
+        if any(is_tp_sync_groups):
             dist.all_reduce(
                 flat_fp32_avg_grads,
                 op=dist.ReduceOp.AVG,
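For context, here is a minimal, self-contained sketch of the gradient-sync pattern this refactor expresses, written against plain torch.distributed. The helper names (_is_norm_group, _is_gate_group, sync_tp_shared_grads), the two-process gloo/CPU setup, and the SUM-then-divide emulation of ReduceOp.AVG (which the actual diff uses directly, on NCCL) are assumptions for illustration only, not the InternLM implementation.

# Minimal sketch only. Assumptions: two CPU processes standing in for the TP
# group (gloo backend), a fake flat fp32 gradient tensor, and simplified
# stand-ins for _is_norm_group / _is_gate_group. ReduceOp.AVG from the diff is
# NCCL-only, so it is emulated here with SUM followed by a divide.
import torch
import torch.distributed as dist


def _is_norm_group(param_group):
    # Hypothetical stand-in: the real optimizer inspects the param group itself.
    return param_group.get("name") == "norm"


def _is_gate_group(param_group):
    # Hypothetical stand-in for the MoE gate check.
    return param_group.get("name") == "moe_gate"


def sync_tp_shared_grads(param_group, flat_fp32_avg_grads, tp_group):
    # Same shape as the refactor: collect the per-kind checks in a tuple,
    # then synchronize if any of them matched.
    is_tp_sync_groups = (
        _is_norm_group(param_group),
        _is_gate_group(param_group),
    )
    if any(is_tp_sync_groups):
        # Averaging across the TP group removes the small per-rank precision
        # differences that accumulate on parameters shared within the group.
        dist.all_reduce(flat_fp32_avg_grads, op=dist.ReduceOp.SUM, group=tp_group)
        flat_fp32_avg_grads /= dist.get_world_size(tp_group)


if __name__ == "__main__":
    # Run with: torchrun --nproc_per_node=2 this_file.py
    dist.init_process_group("gloo")
    rank = dist.get_rank()
    grads = torch.full((4,), float(rank))      # rank 0 -> 0.0, rank 1 -> 1.0
    sync_tp_shared_grads({"name": "norm"}, grads, dist.group.WORLD)
    print(f"rank {rank}: {grads.tolist()}")    # both ranks print [0.5, 0.5, 0.5, 0.5]
    dist.destroy_process_group()

The tuple-plus-any() form makes it easy to append further TP-shared parameter kinds to the check without growing a chained boolean expression.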