mirror of https://github.com/InternLM/InternLM

compute layer norms and replace total_norm with it

parent b3645b0244
commit a94f429a67
@@ -494,11 +494,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         # Gradients may not be fully synchronized here.

     def _compute_norm_with_stage(
-        self,
-        group_id: int = 0,
-        last_bucket: bool = False,
-        last_stage: bool = False,
-        previous_norm=None,
+        self, group_id: int = 0, last_bucket: bool = False, last_stage: bool = False, previous_layer_norms=None
     ):
         # compute norm for gradients that have been reduced
         params, grads = self._param_store.get_reduced_param_for_compute_norm(group_id=group_id, last_bucket=last_bucket)
@@ -507,18 +503,18 @@ class HybridZeroOptimizer(BaseOptimizer):
             grads = [self.padding_grad.to(dtype)]
             params = [self.padding_tensor.to(dtype)]

-        norm = 0
+        layer_norm = None
         if self._clip_grad_norm > 0:
             # this norm is before scaling, it will be very large
-            norm = compute_norm(
+            layer_norm = compute_norm(
                 gradients=grads,
                 parameters=params,
                 last_stage=last_stage,
-                previous_norm=previous_norm,
+                previous_layer_norms=previous_layer_norms,
                 zero_mode=self._broadcast_parallel_mode[group_id],
             )

-        return norm
+        return layer_norm

     @llm_timeout(func_name="optim_step")
     def step(self, closure=None):
@@ -546,10 +542,10 @@ class HybridZeroOptimizer(BaseOptimizer):
             self._reduce_grads_stored_in_bucket(self._bucket_store[group_id], reduce_rank=None, last_bucket=True)

         # compute norm for gradients in the before bucket
-        groups_norms = []
+        groups_layer_norms = []
         for group_id in range(self.num_param_groups):
-            groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
-
+            layer_norm = self._compute_norm_with_stage(group_id=group_id)
+            groups_layer_norms.append(layer_norm)
         # clear reduced grads
         # grads in the last bucket is reduced
         for bucket in self._bucket_in_progress:
@@ -561,15 +557,14 @@ class HybridZeroOptimizer(BaseOptimizer):

         # compute norm for gradients in the last bucket
         total_norms = {}
+        total_layernorms = {}
         for group_id in range(self.num_param_groups):
             group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
             group_name = f"{group_id}_{group_name}"
-            total_norms[group_name] = self._compute_norm_with_stage(
-                group_id=group_id,
-                last_bucket=True,
-                last_stage=True,
-                previous_norm=groups_norms[group_id],
+            total_layernorms[group_name] = self._compute_norm_with_stage(
+                group_id=group_id, last_bucket=True, last_stage=True, previous_layer_norms=groups_layer_norms[group_id]
             )
+            total_norms[group_name] = sum(total_layernorms[group_name].values())

         # Need to allreduce(avg) the norms across different ranks because moe params will not be synced
         # during allreduce
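Net effect of the optimizer-side hunks above: _compute_norm_with_stage now returns a dict of per-layer norms rather than a single scalar, and step() recovers the scalar per-group norm with sum(total_layernorms[group_name].values()). A minimal standalone sketch of that aggregation (the helper and the layer names below are illustrative, not part of the repository):

def scalar_norm_from_layer_norms(layer_norms, norm_type=2.0):
    # Per-layer values are accumulated as norm**norm_type, so sum them first and
    # take the root afterwards, matching the "total_norm**(1/norm)" convention
    # mentioned in compute_norm's docstring.
    return sum(layer_norms.values()) ** (1.0 / norm_type)

example = {"embedding": 4.0, "Block.0": 9.0, "head": 1.0}  # hypothetical layer names
print(scalar_norm_from_layer_norms(example))  # sqrt(14) ~= 3.7417

The remaining hunks are in the optimizer utility module that defines compute_norm and ParamBcastSyncHandler.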
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

+import copy
 import math
 from abc import ABC, abstractmethod
 from collections import OrderedDict
@@ -15,7 +16,7 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.core.naive_amp import NaiveAMPModel
-from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
+from internlm.utils.common import move_norm_to_cuda
 from internlm.utils.logger import get_logger
 from internlm.utils.parallel import is_model_parallel_parameter

@@ -31,6 +32,7 @@ except (ModuleNotFoundError, ImportError):
     APEX_AVAILABLE = False

 inf = math.inf
+global_layer_norms = {"unknown": 0.0, "embedding": 0.0, "norm": 0.0, "head": 0.0}


 def flatten(input_):
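global_layer_norms acts as a template registry: ParamBcastSyncHandler (last hunk below) adds one key per transformer block next to the defaults "unknown", "embedding", "norm", and "head", and compute_norm deep-copies it so every call starts from zeroed accumulators. A small sketch of that pattern, with hypothetical block names:

import copy

# Template registry: one zeroed accumulator per layer name (hypothetical names).
global_layer_norms = {"unknown": 0.0, "embedding": 0.0, "norm": 0.0, "head": 0.0}
for idx in range(2):  # pretend the model has two transformer blocks
    global_layer_norms[f"Block.{idx}"] = 0.0

# Each norm computation works on a private copy, leaving the template untouched.
total_layer_norms = copy.deepcopy(global_layer_norms)
total_layer_norms["Block.0"] += 3.5
print(total_layer_norms["Block.0"], global_layer_norms["Block.0"])  # 3.5 0.0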
@@ -210,7 +212,7 @@ def calc_lp(grads, norm_type):


 def compute_norm(
-    gradients, parameters, last_stage=False, previous_norm=None, norm_type=2, zero_mode=ParallelMode.ZERO1
+    gradients, parameters, last_stage=False, previous_layer_norms=None, norm_type=2, zero_mode=ParallelMode.ZERO1
 ):
     """Get the norm
     Arguments:
@@ -220,42 +222,60 @@ def compute_norm(
             infinity norm.

     Returns:
-        Total norm of the parameters, need total_norm**(1/norm) before using.
+        gradient norm for all layers, need total_norm**(1/norm) before using.
     """

     enable_cuda_kernels = gradients[0].device.type == "cuda"
     # Norm parameters.
     norm_type = float(norm_type)
+    total_layer_norms = copy.deepcopy(global_layer_norms)
+    layer_grads = {}
     # Calculate norm.
     if norm_type == inf:
-        total_norm = max(g.data.abs().max() for g in gradients)
-        total_norm_cuda = torch.FloatTensor([float(total_norm)], device=gradients[0].device)
+        for g, p in zip(gradients, parameters):
+            layer_name = p.layer_name if hasattr(p, "layer_name") else "unknown"
+            if layer_name not in layer_grads:
+                layer_grads[layer_name] = []
+            layer_grads[layer_name].append(g)
+        for layer_name, grads in layer_grads.items():
+            layer_norm = max(g.data.abs().max() for g in grads)
+            total_layer_norms[layer_name] = max(total_layer_norms[layer_name], float(layer_norm))

         if last_stage is False:
-            return total_norm_cuda
+            return total_layer_norms

-        if previous_norm is not None:
-            total_norm_cuda = max(total_norm_cuda, previous_norm)
+        if previous_layer_norms is not None:
+            for key, value in previous_layer_norms.items():
+                total_layer_norms[key] = max(value, total_layer_norms[key])
+
+        total_layer_norms_values = move_norm_to_cuda(torch.Tensor(list(total_layer_norms.values())))
+        total_layer_norms_keys = list(global_layer_norms.keys())

         # Take max across all model-parallel GPUs.
-        if gpc.get_world_size(ParallelMode.MODEL) > 1:
+        if gpc.is_initialized(ParallelMode.MODEL):
             dist.all_reduce(
-                total_norm_cuda,
+                total_layer_norms_values,
                 op=dist.ReduceOp.MAX,
                 group=gpc.get_group(ParallelMode.MODEL),
             )
-        total_norm = total_norm_cuda[0].item()
+        for idx in range(len(total_layer_norms_keys)):
+            layer_norm = total_layer_norms_values[idx]
+            if torch.is_tensor(layer_norm):
+                layer_norm = layer_norm.item()
+            total_layer_norms[total_layer_norms_keys[idx]] = layer_norm

     else:
-        tensor_parallel_grads = []
         for g, p in zip(gradients, parameters):
             # TODO: consider the pipeline shared parameter
+            layer_name = p.layer_name if hasattr(p, "layer_name") else "unknown"
+            if layer_name not in layer_grads:
+                layer_grads[layer_name] = []
             if (
                 gpc.is_initialized(ParallelMode.PIPELINE)
                 and hasattr(p, "pipeline_shared_module_pg")
                 and dist.get_rank(p.pipeline_shared_module_pg) == 0
             ):  # if shared between different pipe, only count o
-                tensor_parallel_grads.append(g.data.float())
+                layer_grads[layer_name].append(g.data.float())
             elif (
                 gpc.is_initialized(ParallelMode.PIPELINE)
                 and hasattr(p, "pipeline_shared_module_pg")
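Since dist.all_reduce operates on tensors, the per-layer dict is packed into a single value tensor in a fixed key order before the cross-rank MAX reduction, then written back key by key. A communication-free sketch of that pack/reduce/unpack round trip, with two in-process dicts standing in for two ranks (names and values are made up):

import torch

keys = ["embedding", "Block.0", "head"]  # hypothetical fixed key order
rank0 = {"embedding": 1.0, "Block.0": 5.0, "head": 2.0}
rank1 = {"embedding": 3.0, "Block.0": 4.0, "head": 2.5}

# Pack the dict values into tensors using the shared key order.
t0 = torch.tensor([rank0[k] for k in keys])
t1 = torch.tensor([rank1[k] for k in keys])

# Stand-in for dist.all_reduce(..., op=dist.ReduceOp.MAX): element-wise max.
reduced = torch.maximum(t0, t1)

# Unpack back into a dict, item by item, as the new code does after the reduce.
result = {k: reduced[i].item() for i, k in enumerate(keys)}
print(result)  # {'embedding': 3.0, 'Block.0': 5.0, 'head': 2.5}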
@@ -267,56 +287,50 @@ def compute_norm(
                 and not is_model_parallel_parameter(p)
                 and gpc.get_local_rank(ParallelMode.TENSOR) == 0
             ):  # if not used in each chunk, such as layernorm
-                tensor_parallel_grads.append(g.data.float())
+                layer_grads[layer_name].append(g.data.float())
             elif is_model_parallel_parameter(p):
-                tensor_parallel_grads.append(g.data.float())
+                layer_grads[layer_name].append(g.data.float())
             elif gpc.get_local_rank(ParallelMode.TENSOR) != 0:
                 continue
             else:
                 raise RuntimeError("Should not arrive here")

-        if norm_type == 2.0 and enable_cuda_kernels:
-            tensor_parallel_norm = calc_l2_norm(tensor_parallel_grads) ** norm_type
-        else:
-            tensor_parallel_norm = calc_lp(tensor_parallel_grads, norm_type)
-
-        # If norm is type of float, then we convert them into torch.Tensor.
-        tensor_parallel_norm = get_tensor_norm(tensor_parallel_norm, enable_cuda_kernels)
-        # If grads are on CPU, the norms is also on CPU. Cast them to CUDA tensors
-        if not enable_cuda_kernels:
-            tensor_parallel_norm = move_norm_to_cuda(tensor_parallel_norm)
-
-        total_norm = tensor_parallel_norm
+        # calculate lay norm
+        for layer_name, grads in layer_grads.items():
+            if norm_type == 2.0 and enable_cuda_kernels:
+                layer_norm = calc_l2_norm(grads) ** norm_type
+            else:
+                layer_norm = calc_lp(grads, norm_type)
+            total_layer_norms[layer_name] += layer_norm.item() if torch.is_tensor(layer_norm) else layer_norm

     if last_stage is False:
-        return total_norm
+        return total_layer_norms

-    if previous_norm is not None:
-        total_norm = total_norm + previous_norm
+    if previous_layer_norms is not None:
+        for key, value in previous_layer_norms.items():
+            total_layer_norms[key] += value

+    # sync layer norm
     # Sum across all model-parallel GPUs.
+    total_layer_norms_values = move_norm_to_cuda(torch.Tensor(list(total_layer_norms.values())))
+    total_layer_norms_keys = list(total_layer_norms.keys())
+
     if gpc.is_initialized(ParallelMode.MODEL):
-        dist.all_reduce(
-            total_norm,
-            op=dist.ReduceOp.SUM,
-            group=gpc.get_group(ParallelMode.MODEL),
-        )
-
-    # This is because we use zero1, so we need to use this reduction.
-    # TODO: Check zero group to be a subset of dp group.
-    dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
+        dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
+    dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))

-    if torch.is_tensor(total_norm):
-        total_norm = total_norm.item()
-
-    # Scale.
-    if total_norm == float("inf") or total_norm == -float("inf"):
-        total_norm = -1
-
-    if math.isnan(total_norm):
-        total_norm = -2
-
-    return total_norm
+    for idx in range(len(total_layer_norms_keys)):
+        layer_norm = total_layer_norms_values[idx]
+        if torch.is_tensor(layer_norm):
+            layer_norm = layer_norm.item()
+        if layer_norm == float("inf") or layer_norm == -float("inf"):
+            layer_norm = -1
+        if math.isnan(layer_norm):
+            layer_norm = -2
+        total_layer_norms[total_layer_norms_keys[idx]] = layer_norm
+
+    return total_layer_norms


 class BaseGradScaler(ABC):
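The core of the reworked compute_norm is a group-by step: gradients are bucketed by the layer_name attribute stamped on their parameters, a norm is computed per bucket, and the results accumulate into the copied registry. The sketch below reproduces only that grouping on a single process with plain PyTorch, leaving out the tensor/pipeline-parallel branches and the apex kernels, so it is a simplification rather than the actual function:

import torch

def per_layer_grad_norms(gradients, parameters, norm_type=2.0):
    # Group gradients by the optional layer_name attribute of their parameter and
    # return {layer_name: sum of norm**norm_type}, mirroring the accumulation
    # that compute_norm performs before the cross-rank reductions.
    layer_grads = {}
    for g, p in zip(gradients, parameters):
        layer_name = p.layer_name if hasattr(p, "layer_name") else "unknown"
        layer_grads.setdefault(layer_name, []).append(g.data.float())

    totals = {}
    for layer_name, grads in layer_grads.items():
        layer_norm = sum(g.norm(norm_type) ** norm_type for g in grads)
        totals[layer_name] = float(layer_norm)
    return totals

# Usage: tag a parameter, then pass gradients in the matching order.
p1, p2 = torch.nn.Parameter(torch.ones(4)), torch.nn.Parameter(torch.ones(4))
p1.layer_name = "embedding"  # hypothetical layer name; p2 falls back to "unknown"
grads = [torch.full((4,), 2.0), torch.full((4,), 3.0)]
print(per_layer_grad_norms(grads, [p1, p2]))  # {'embedding': 16.0, 'unknown': 36.0}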
@@ -507,18 +521,32 @@ class ParamBcastSyncHandler:
         for _chunk in model:
             if isinstance(_chunk, NaiveAMPModel):
                 _chunk = _chunk.model

-            for _, children in _chunk.named_children():
+            # if gpc.is_rank_for_log():
+            # logger.info(_chunk)
+            # [ name for name , _ in model.model.named_children()]
+            for name, children in _chunk.named_children():
                 # should be the transformer block definaton in modeling_xxx.py
                 if isinstance(children, nn.ModuleList):
                     # record the block that a parameter belongs to
-                    for _, block in enumerate(children):
+                    for idx, block in enumerate(children):
                         # self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
                         self._block_to_param[block] = list(block.parameters())
+                        for parameter in self._block_to_param[block]:
+                            layer_name = f"{block.__class__.__name__}.{idx}"
+                            # if gpc.is_rank_for_log():
+                            # logger.info(layer_name)
+                            global_layer_norms[layer_name] = 0.0
+                            parameter.__setattr__("layer_name", layer_name)
                 else:
                     # record the block that a parameter belongs to
                     # self._block_to_param[name] = list(children.parameters())
                     self._block_to_param[children] = list(children.parameters())
+                    for parameter in self._block_to_param[children]:
+                        layer_name = f"{children.__class__.__name__}"
+                        # if gpc.is_rank_for_log():
+                        # logger.info(layer_name)
+                        # global_layer_norms[layer_name] = 0.0
+                        parameter.__setattr__("layer_name", name)

         alloc_num = 0
         rank_to_go = 0
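For the tagging itself, ParamBcastSyncHandler walks the model's children, names each nn.ModuleList entry as "<BlockClass>.<idx>", registers that name in global_layer_norms, and stamps every parameter with a layer_name attribute (other children reuse their child name). A stripped-down sketch of the same walk on a toy model; the module names and structure are hypothetical:

import torch.nn as nn

global_layer_norms = {"unknown": 0.0}

def tag_layer_names(model: nn.Module):
    # Stamp a layer_name attribute on every parameter, mirroring the walk above:
    # nn.ModuleList children get "<Class>.<idx>", other children keep their name.
    for name, children in model.named_children():
        if isinstance(children, nn.ModuleList):
            for idx, block in enumerate(children):
                layer_name = f"{block.__class__.__name__}.{idx}"
                global_layer_norms[layer_name] = 0.0
                for parameter in block.parameters():
                    parameter.layer_name = layer_name
        else:
            for parameter in children.parameters():
                parameter.layer_name = name

# Toy model: an embedding-like layer, two "blocks", and a head.
toy = nn.Sequential()
toy.add_module("embedding", nn.Linear(8, 8))
toy.add_module("blocks", nn.ModuleList([nn.Linear(8, 8), nn.Linear(8, 8)]))
toy.add_module("head", nn.Linear(8, 2))
tag_layer_names(toy)
print(sorted({p.layer_name for p in toy.parameters()}))
# ['Linear.0', 'Linear.1', 'embedding', 'head']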