mirror of https://github.com/InternLM/InternLM
modified: internlm/core/context/process_group_initializer.py
modified: internlm/core/scheduler/no_pipeline_scheduler.py
modified: internlm/solver/optimizer/hybrid_zero_optim.py
branch: pull/375/head
parent 2a52452ed2
commit 84476833f3
--- a/internlm/core/context/process_group_initializer.py
+++ b/internlm/core/context/process_group_initializer.py
@@ -344,6 +344,7 @@ class Initializer_Zero1(ProcessGroupInitializer):
         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode


 class Initializer_Expert(ProcessGroupInitializer):
     """A ProcessGroupInitializer for zero-1 parallelism.
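The hunk above only shows context around Initializer_Zero1 and Initializer_Expert, but the visible return line reveals the six-element tuple contract that every initializer follows. A rough sketch of that contract is shown below, assuming torch.distributed is already initialized and using a made-up mode string instead of InternLM's real ParallelMode member; this is illustrative, not the repository's implementation.

import torch.distributed as dist

class DummyInitializer:
    """Illustrative only: shows the tuple an init_dist_group method returns."""

    def __init__(self, rank: int, world_size: int):
        self.rank = rank
        self.world_size = world_size

    def init_dist_group(self):
        ranks_in_group = list(range(self.world_size))               # every rank in one group
        process_group = dist.new_group(ranks_in_group)              # group for GPU collectives
        cpu_group = dist.new_group(ranks_in_group, backend="gloo")  # CPU-side group
        local_rank = ranks_in_group.index(self.rank)
        group_world_size = len(ranks_in_group)
        mode = "dummy_parallel_mode"  # InternLM returns a ParallelMode enum member here
        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode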
--- a/internlm/core/scheduler/no_pipeline_scheduler.py
+++ b/internlm/core/scheduler/no_pipeline_scheduler.py
@@ -116,7 +116,7 @@ class NonPipelineScheduler(BaseScheduler):
             self._call_hooks("after_criterion", loss)
             moe_loss = sum(moe_losses) * moe_loss_coeff
             loss += moe_loss
-            loss /= scale_loss  ## TODO: check whether mos_loss should be scaled
+            loss /= scale_loss  # TODO: check whether mos_loss should be scaled

         # backward
         if not forward_only:
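Note on the scheduler hunk above: the MoE auxiliary losses are summed, weighted by moe_loss_coeff, added to the task loss, and the total is divided by scale_loss (the gradient-accumulation factor). A minimal sketch of that pattern follows; the helper name, tensor values, and coefficient are illustrative assumptions, not InternLM's actual scheduler code.

import torch

def combine_losses(loss, moe_losses, moe_loss_coeff, scale_loss):
    # illustrative helper: weight and add the MoE auxiliary (load-balancing) losses,
    # then average over the gradient-accumulation steps before calling backward()
    moe_loss = sum(moe_losses) * moe_loss_coeff
    loss = loss + moe_loss
    return loss / scale_loss

# toy usage with assumed values
task_loss = torch.tensor(2.0, requires_grad=True)
aux_losses = [torch.tensor(0.10), torch.tensor(0.25)]
total = combine_losses(task_loss, aux_losses, moe_loss_coeff=0.01, scale_loss=4)
total.backward()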
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -9,6 +9,7 @@ from torch.optim import Optimizer

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.moe import is_moe_param
 from internlm.solver.optimizer.store import (
     BucketStore,
     GradientStore,
@@ -25,7 +26,6 @@ from internlm.solver.optimizer.utils import (
     split_half_float_double,
     sync_param,
 )
-from internlm.model.moe import is_moe_param
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
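The two import hunks above simply move from internlm.model.moe import is_moe_param up into the sorted import block of hybrid_zero_optim.py; no behavior changes. As a rough illustration of why a ZeRO-style optimizer wants such a predicate, the sketch below splits parameters into MoE (expert) and dense groups so they can be synchronized over different process groups. The is_moe attribute check is an assumed stand-in, not the real implementation of is_moe_param.

import torch

def is_moe_param(param: torch.nn.Parameter) -> bool:
    # assumed convention for this sketch: expert parameters carry a flag set by the MoE layer
    return getattr(param, "is_moe", False)

def split_moe_and_dense(params):
    # expert weights are typically reduced over the expert-parallel group,
    # dense weights over the data-parallel group
    moe_params = [p for p in params if is_moe_param(p)]
    dense_params = [p for p in params if not is_moe_param(p)]
    return moe_params, dense_params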
@@ -537,7 +537,7 @@ class HybridZeroOptimizer(BaseOptimizer):
             norm_groups.append(norm_group)

         loss_scale = float(self.loss_scale.item())  # backup
-        if not gpc.config.model.dtype is torch.float32:
+        if gpc.config.model.dtype is not torch.float32:
             self.grad_scaler.update(found_inf)
         # update loss scale if overflow occurs
         if found_inf:
@@ -581,7 +581,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         global_norm = sum(norm_groups) ** 0.5

         # the following operations are performed only on the rank to which parameters are assigned.
-        if not gpc.config.model.dtype is torch.float32:
+        if gpc.config.model.dtype is not torch.float32:
             if len(single_grad_partition_groups) != 0:
                 self._unscale_and_clip_grads(single_grad_partition_groups, global_norm, loss_scale)
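The last two hunks are pure style fixes: not x is y parses as not (x is y) and is therefore equivalent to x is not y, but the latter is the idiomatic spelling (flake8 flags the former as E714). A small standalone check, using an assumed stand-in dtype instead of the optimizer's real gpc.config.model.dtype:

import torch

dtype = torch.float16  # stand-in for gpc.config.model.dtype

# both forms evaluate identically; only readability differs
assert (not dtype is torch.float32) == (dtype is not torch.float32)

if dtype is not torch.float32:
    # mixed-precision path: the grad scaler only needs updating when not training in fp32
    print("update grad scaler / unscale and clip gradients")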