mirror of https://github.com/InternLM/InternLM
test moe layer norm
parent 1ac7e4b489
commit 45fd0ec86b
@@ -527,7 +527,12 @@ class HybridZeroOptimizer(BaseOptimizer):
         params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
 
         total_param_norms = {}
-        if self._clip_grad_norm > 0 and len(params) > 0:
+        if len(params) == 0:
+            dtype = self.param_groups[group_id]["dtype"]
+            grads = [self.padding_grad.to(dtype)]
+            params = [self.padding_tensor.to(dtype)]
+
+        if self._clip_grad_norm > 0:
             total_param_norms = compute_param_norm(
                 grads,
                 params,
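The new `len(params) == 0` branch is the MoE-related part of this commit: a rank can hold no local parameters at all for an expert group, and substituting the preallocated `padding_grad`/`padding_tensor` keeps that rank issuing the same `compute_param_norm` call (and its cross-rank reduction) as every other rank. Below is a minimal runnable sketch of the substitution, not the repository's code; the helper name `pad_if_empty` and the shapes of the padding tensors are assumptions.

    import torch

    # Stand-ins for the optimizer's preallocated padding tensors
    # (their real shapes/values are not shown in the diff).
    padding_grad = torch.zeros(1)
    padding_tensor = torch.zeros(1)

    def pad_if_empty(params, grads, dtype):
        # A rank that owns no params in this group must still take part in
        # the norm computation, so give it one dummy param/grad pair cast
        # to the group's dtype.
        if len(params) == 0:
            grads = [padding_grad.to(dtype)]
            params = [padding_tensor.to(dtype)]
        return params, grads

    params, grads = pad_if_empty([], [], torch.bfloat16)
    assert len(params) == len(grads) == 1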
@@ -345,14 +345,15 @@ def compute_param_norm(
 
     param_grads = {}
     for g, p in zip(gradients, parameters):
-        if p.param_name not in param_grads:
-            param_grads[p.param_name] = []
+        param_name = p.param_name if hasattr(p, "param_name") else "unknown-padding"
+        if param_name not in param_grads:
+            param_grads[param_name] = []
         if (
             gpc.is_initialized(ParallelMode.PIPELINE)
             and hasattr(p, "pipeline_shared_module_pg")
             and dist.get_rank(p.pipeline_shared_module_pg) == 0
         ):  # if shared between different pipe, only count once
-            param_grads[p.param_name].append(g.data.float())
+            param_grads[param_name].append(g.data.float())
         elif (
             gpc.is_initialized(ParallelMode.PIPELINE)
             and hasattr(p, "pipeline_shared_module_pg")
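The `param_name` fallback in this hunk pairs with the padding above: the dummy padding tensors never had a `param_name` attribute assigned, so indexing `p.param_name` directly would raise `AttributeError`; the grouping dict now keys them under `"unknown-padding"`. A self-contained sketch of the pattern; the tensor and the name string here are illustrative, not taken from the repository:

    import torch

    p_real = torch.ones(2)
    p_real.param_name = "layers.0.norm.weight"  # illustrative name
    p_pad = torch.zeros(1)                      # padding tensor, no param_name

    param_grads = {}
    for g, p in zip([torch.ones(2), torch.zeros(1)], [p_real, p_pad]):
        name = p.param_name if hasattr(p, "param_name") else "unknown-padding"
        param_grads.setdefault(name, []).append(g.data.float())

    print(sorted(param_grads))  # ['layers.0.norm.weight', 'unknown-padding']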
@@ -364,9 +365,9 @@ def compute_param_norm(
             and not is_model_parallel_parameter(p)
             and gpc.get_local_rank(ParallelMode.TENSOR) == 0
         ):  # if not used in each chunk, such as layernorm
-            param_grads[p.param_name].append(g.data.float())
+            param_grads[param_name].append(g.data.float())
         elif is_model_parallel_parameter(p):
-            param_grads[p.param_name].append(g.data.float())
+            param_grads[param_name].append(g.data.float())
         elif gpc.get_local_rank(ParallelMode.TENSOR) != 0:
             continue
         else:
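For context on the branch ordering this hunk touches: parameters that are not model-parallel (layernorm weights, as the comment notes) are replicated on every TENSOR-mode rank, so only local rank 0 contributes them, while model-parallel parameters contribute their local shard from every rank. A condensed sketch of that rule with stand-in helpers; `counts_toward_norm` and the `is_tensor_parallel` flag are assumptions, not InternLM's gpc API:

    def is_model_parallel_parameter(p):
        # Stand-in for the real check; assumes a boolean flag on the param.
        return getattr(p, "is_tensor_parallel", False)

    def counts_toward_norm(p, tensor_local_rank):
        if is_model_parallel_parameter(p):
            return True                # sharded: each rank owns a distinct slice
        return tensor_local_rank == 0  # replicated (e.g. layernorm): once only

    class P:  # minimal stand-in parameter
        is_tensor_parallel = False

    assert counts_toward_norm(P(), tensor_local_rank=0)
    assert not counts_toward_norm(P(), tensor_local_rank=1)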