diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py index 64ff4de..6e73be4 100644 --- a/internlm/model/modeling_internlm.py +++ b/internlm/model/modeling_internlm.py @@ -450,7 +450,7 @@ def build_model_with_cfg( apply_post_layer_norm=False, # pylint: disable=W0613 layer_norm_epsilon=1e-5, is_reward=False, - dropout_selective_checkpoint=True, + dropout_selective_checkpoint=False, use_scaled_init: bool = True, use_swiglu: bool = True, use_flash_attn: bool = True, diff --git a/internlm/solver/optimizer/fsdp_optimizer.py b/internlm/solver/optimizer/fsdp_optimizer.py index a19de0e..78d98a3 100644 --- a/internlm/solver/optimizer/fsdp_optimizer.py +++ b/internlm/solver/optimizer/fsdp_optimizer.py @@ -66,7 +66,7 @@ class FSDPadaptOptimizer(BaseOptimizer): self._fp16_param_groups[group_idx] = group_params # create copy of fp32 weight - fp32_tensor_param = [param.data.float().requires_grad_(True) for param in group_params] + fp32_tensor_param = [param.data.float() for param in group_params] self._fp32_param_tensor_groups[group_idx] = fp32_tensor_param # replace