mirror of https://github.com/InternLM/InternLM
modified: internlm/core/context/process_group_initializer.py
modified: internlm/core/scheduler/no_pipeline_scheduler.py
modified: internlm/solver/optimizer/hybrid_zero_optim.py
branch: pull/375/head
parent 2a52452ed2
commit 84476833f3
internlm/core/context/process_group_initializer.py

@@ -343,7 +343,8 @@ class Initializer_Zero1(ProcessGroupInitializer):
         ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode


 class Initializer_Expert(ProcessGroupInitializer):
     """A ProcessGroupInitializer for zero-1 parallelism.
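Note: the return tuple in the hunk above follows the pattern shared by the initializers in this file: each one builds a torch.distributed group for a set of ranks and reports the calling rank's position in it. A minimal sketch of that pattern, assuming a hypothetical build_group helper and that dist.init_process_group has already been called (none of this is taken from the diff itself):

# Hedged sketch of the group-construction pattern; `build_group` is hypothetical.
import torch.distributed as dist

def build_group(ranks, rank, mode, backend="nccl"):
    # Device-side group for this rank set, plus a gloo group for CPU collectives.
    process_group = dist.new_group(ranks, backend=backend)
    cpu_group = dist.new_group(ranks, backend="gloo") if backend != "gloo" else process_group
    local_rank = ranks.index(rank) if rank in ranks else None
    group_world_size = len(ranks)
    ranks_in_group = ranks
    return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode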
internlm/core/scheduler/no_pipeline_scheduler.py

@@ -116,7 +116,7 @@ class NonPipelineScheduler(BaseScheduler):
             self._call_hooks("after_criterion", loss)
             moe_loss = sum(moe_losses) * moe_loss_coeff
             loss += moe_loss
-            loss /= scale_loss ## TODO: check whether mos_loss should be scaled
+            loss /= scale_loss  # TODO: check whether mos_loss should be scaled

         # backward
         if not forward_only:
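Note: the scheduler hunk above assembles the per-step loss: the auxiliary losses returned by the MoE gates are summed, weighted by moe_loss_coeff, added to the criterion loss, and the total is divided by scale_loss (the gradient-accumulation factor). A standalone sketch of that arithmetic, with a hypothetical combine_losses helper and an assumed coefficient of 0.01:

import torch

def combine_losses(loss, moe_losses, moe_loss_coeff=0.01, scale_loss=1):
    # Sum the load-balancing losses from all MoE layers and weight them.
    moe_loss = sum(moe_losses) * moe_loss_coeff
    loss = loss + moe_loss
    # Divide by the accumulation factor so accumulated micro-steps average out.
    return loss / scale_loss

# Example with two MoE layers and 4 accumulation steps: (2.0 + 0.004) / 4 = 0.501
print(combine_losses(torch.tensor(2.0), [torch.tensor(0.1), torch.tensor(0.3)], scale_loss=4))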
internlm/solver/optimizer/hybrid_zero_optim.py

@@ -9,6 +9,7 @@ from torch.optim import Optimizer

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.moe import is_moe_param
 from internlm.solver.optimizer.store import (
     BucketStore,
     GradientStore,
@@ -25,7 +26,6 @@ from internlm.solver.optimizer.utils import (
     split_half_float_double,
     sync_param,
 )
-from internlm.model.moe import is_moe_param
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
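Note: the two import hunks above only move `from internlm.model.moe import is_moe_param` up next to the other internlm imports; no behaviour changes. Inside a ZeRO-style optimizer, a predicate like this is typically used to split expert parameters from dense ones so they can be reduced over different process groups. A hedged sketch of such usage (the split_moe_params helper is hypothetical and the internals of is_moe_param are not shown in this diff):

def split_moe_params(named_params, is_moe_param):
    # Partition parameters into dense and MoE groups (assumed usage pattern).
    dense, moe = [], []
    for name, param in named_params:
        (moe if is_moe_param(param) else dense).append((name, param))
    return dense, moe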
@@ -537,7 +537,7 @@ class HybridZeroOptimizer(BaseOptimizer):
             norm_groups.append(norm_group)

         loss_scale = float(self.loss_scale.item())  # backup
-        if not gpc.config.model.dtype is torch.float32:
+        if gpc.config.model.dtype is not torch.float32:
             self.grad_scaler.update(found_inf)
             # update loss scale if overflow occurs
             if found_inf:
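Note: `not gpc.config.model.dtype is torch.float32` and `gpc.config.model.dtype is not torch.float32` are equivalent; the new spelling is simply the idiomatic form (flake8 E714). The guarded logic, skip loss-scale bookkeeping for fp32 runs and back the dynamic loss scale off when an overflow is found, can be sketched as below; update_loss_scale and its backoff/growth constants are assumptions, not code from this diff:

import torch

def update_loss_scale(loss_scale, found_inf, dtype, good_steps=0,
                      backoff=0.5, growth=2.0, growth_interval=1000):
    # fp32 training needs no dynamic loss scaling.
    if dtype is torch.float32:
        return loss_scale, good_steps
    if found_inf:
        # Overflow detected: shrink the scale and reset the growth counter.
        return loss_scale * backoff, 0
    good_steps += 1
    if good_steps % growth_interval == 0:
        # Long stretch without overflow: try a larger scale again.
        loss_scale *= growth
    return loss_scale, good_steps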
@@ -581,7 +581,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         global_norm = sum(norm_groups) ** 0.5

         # the following operations are performed only on the rank to which parameters are assigned.
-        if not gpc.config.model.dtype is torch.float32:
+        if gpc.config.model.dtype is not torch.float32:
             if len(single_grad_partition_groups) != 0:
                 self._unscale_and_clip_grads(single_grad_partition_groups, global_norm, loss_scale)
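Note: `norm_groups` holds the squared gradient norm of each parameter group, so `sum(norm_groups) ** 0.5` is the global L2 norm. An unscale-and-clip step like `_unscale_and_clip_grads` typically folds the inverse loss scale and the clip coefficient into one in-place multiply; the sketch below assumes the norm was computed on still-scaled gradients, and the clip threshold and epsilon are assumptions:

import torch

def unscale_and_clip_(grads, global_norm, loss_scale, clip_grad=1.0, eps=1e-6):
    # Undo loss scaling and apply global-norm clipping with a single multiply per grad.
    combined = 1.0 / loss_scale
    if clip_grad > 0:
        clip_coef = clip_grad / (global_norm / loss_scale + eps)
        if clip_coef < 1.0:
            combined *= clip_coef
    for grad in grads:
        grad.mul_(combined)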