mirror of https://github.com/InternLM/InternLM
change code comments
parent bf6dbf07fa
commit c0d9063a8d
@@ -70,8 +70,7 @@ def initialize_trainer(
     assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"

-    # gradient handler, only support PipelineSharedModuleGradientHandler now
-    # TODO: can refactor code here
+    # gradient handler, support PipelineSharedModuleGradientHandler and EmbeddingSharedModuleGradientHandler now
     if gpc.is_using_pp():
         gpc.config.gradient_handler = [
             dict(type="PipelineSharedModuleGradientHandler"),
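The `gradient_handler` entry above is a list of config dicts that the trainer later turns into handler objects. Below is a minimal sketch of that pattern, assuming a simple name-to-class registry; the registry and the stub handler are illustrative, not InternLM's actual classes, and only the `dict(type=...)` config shape comes from the diff.

```python
from typing import Dict, List, Type


class PipelineSharedModuleGradientHandler:
    """Stub: a real handler would all-reduce gradients of parameters shared
    across pipeline stages before the optimizer step."""

    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer

    def handle_gradient(self) -> None:
        pass  # all-reduce shared-parameter grads here


HANDLER_REGISTRY: Dict[str, Type] = {
    "PipelineSharedModuleGradientHandler": PipelineSharedModuleGradientHandler,
}


def build_gradient_handlers(cfg_list: List[dict], model, optimizer) -> list:
    # Turn config dicts like dict(type="PipelineSharedModuleGradientHandler")
    # into handler instances; done once during trainer initialization.
    return [HANDLER_REGISTRY[cfg["type"]](model, optimizer) for cfg in cfg_list]


handlers = build_gradient_handlers([dict(type="PipelineSharedModuleGradientHandler")], model=None, optimizer=None)
for handler in handlers:
    handler.handle_gradient()  # invoked after backward, before optimizer.step()
```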
@@ -58,7 +58,7 @@ class ScaleColumnParallelLinear(nn.Linear):
         # If not, then the input is already gathered.
         if shared_weight is None:
             if self.weight is None:
-                raise RuntimeError("weight was not given in forward pass " "and skip_weight_allocation is True.")
+                raise RuntimeError("weight was not given in forward pass and skip_weight_allocation is True.")
             shared_weight = self.weight
         if self.weight_scale != 1:
             weight = shared_weight * self.weight_scale + (1 - self.weight_scale) * shared_weight.detach()
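The `weight_scale` expression in the last context line relies on a standard PyTorch trick: `s * w + (1 - s) * w.detach()` leaves the forward value equal to `w` while scaling the gradient that flows into `w` by `s`. A tiny standalone check:

```python
import torch

w = torch.ones(3, requires_grad=True)
s = 0.5
# Forward value equals w (the two terms sum back to w), but only the first
# term carries gradient, so d(out)/dw is scaled by s.
out = (s * w + (1 - s) * w.detach()).sum()
out.backward()
print(out.item())  # 3.0 -- forward result unchanged
print(w.grad)      # tensor([0.5000, 0.5000, 0.5000]) -- gradient scaled by s
```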
@@ -102,7 +102,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
         skip_weight_alloction: bool = False,
     ) -> None:
         # TODO have not use RewardModelLinear for now
-        assert not skip_weight_alloction, "shared weight not support here for now"
+        assert not skip_weight_alloction, "shared weight is not supported in RewardModelLinear for now"

         super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
         torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
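The trailing `torch.distributed.broadcast` call keeps every tensor-parallel rank's copy of the reward head identical after random initialization. A generic sketch of that pattern with a plain process group; the helper name is hypothetical, and InternLM resolves the source rank via `gpc` rather than taking it as an argument.

```python
import torch
import torch.distributed as dist


def broadcast_head_weight(weight: torch.Tensor, src_rank: int = 0, group=None) -> None:
    # Overwrite each rank's randomly initialized copy with src_rank's tensor
    # so every rank in the group starts from identical reward-head weights.
    dist.broadcast(weight, src=src_rank, group=group)


# Typical use under torchrun, after building the layer on every rank:
#   dist.init_process_group(backend="nccl")
#   head = torch.nn.Linear(4096, 1, bias=False).cuda()
#   broadcast_head_weight(head.weight.data)
```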
@@ -331,7 +331,7 @@ class PackedFlashInternLm1D(nn.Module):
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """

     def __init__(
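`tie_embeddings_and_output_weights` refers to the usual weight-tying setup where the output projection reuses the token-embedding matrix. A plain PyTorch illustration of the idea, independent of InternLM's pipeline-parallel handling; the module and class names below are made up for the example.

```python
import torch
import torch.nn as nn


class TinyTiedLM(nn.Module):
    # Output projection reuses the embedding matrix instead of owning its own,
    # which is what the tie_embeddings_and_output_weights flag refers to.
    def __init__(self, vocab_size: int = 100, hidden: int = 32):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.head = nn.Linear(hidden, vocab_size, bias=False)
        self.head.weight = self.embed.weight  # single shared parameter

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return self.head(self.embed(tokens))


model = TinyTiedLM()
assert model.head.weight.data_ptr() == model.embed.weight.data_ptr()
logits = model(torch.tensor([[1, 2, 3]]))  # shape (1, 3, vocab_size)
```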
@@ -529,7 +529,7 @@ class PackedFlashInternLm1D(nn.Module):
         device: Optional[torch.device] = None,
     ):
         if not self.tie_embeddings_and_output_weights:
-            raise Exception("initialize_word_embeddings() was called but " "tie_embeddings_and_output_weights is false")
+            raise Exception("initialize_word_embeddings() was called but tie_embeddings_and_output_weights is false")

         # This function just initializes the word embeddings in the final stage
         # when we are using pipeline parallelism. Nothing to do if we aren't
@@ -546,9 +546,9 @@ class PackedFlashInternLm1D(nn.Module):
         # 2. Do an all-reduce between the first and last stage to ensure that
         # the two copies of word_embeddings start off with the same
         # parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        # the two word_embeddings layers to ensure that every applied weight
-        # update is the same on both stages.
+        # 3. In the training loop, before step perform an all-reduce between the
+        # grads of the two word_embeddings layers to ensure that every applied
+        # weight update is the same on both stages.
         if gpc.is_pipeline_last_stage():
             assert not gpc.is_pipeline_first_stage()
             # set word_embeddings weights to 0 here, then copy first
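Step 3 in the comment above describes the gradient synchronization for the tied embedding. A generic sketch of that step, assuming `embedding_group` is a process group containing only the first- and last-stage ranks; this is the pattern the comment describes, not the InternLM implementation.

```python
import torch
import torch.distributed as dist


def allreduce_tied_embedding_grad(
    word_embeddings: torch.nn.Embedding,
    embedding_group,          # process group holding only first- and last-stage ranks
    is_first_stage: bool,
    is_last_stage: bool,
) -> None:
    # Before optimizer.step(): sum the gradients of the two copies of the tied
    # word embedding so the first and last stages apply identical updates.
    if not (is_first_stage or is_last_stage):
        return  # intermediate stages hold no copy of the tied weight
    grad = word_embeddings.weight.grad
    if grad is not None:
        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=embedding_group)
```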
@@ -692,7 +692,7 @@ def build_model_with_moe_cfg(
        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                           (https://arxiv.org/abs/2201.05596) layer.
-       tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+       tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
    """

    cfg = dict(