mirror of https://github.com/InternLM/InternLM
fix(internlm/model): reset dropout_selective_checkpoint=True
parent 5bca32e4dc
commit bd809a61f2
@@ -461,7 +461,7 @@ def build_model_with_cfg(
     apply_post_layer_norm=False,  # pylint: disable=W0613
     layer_norm_epsilon=1e-5,
     is_reward=False,
-    dropout_selective_checkpoint=False,
+    dropout_selective_checkpoint=True,
     use_scaled_init: bool = True,
     use_swiglu: bool = True,
     use_flash_attn: bool = True,
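The hunk above flips the default dropout_selective_checkpoint from False to True in build_model_with_cfg. As a rough, stand-alone sketch of the idea only (this is not InternLM's internal implementation, which applies the technique inside its transformer blocks): with selective checkpointing, the dropout output is not stored for the backward pass but recomputed, and torch.utils.checkpoint replays the saved RNG state so the recomputed mask matches the forward pass.

import torch
from torch.utils.checkpoint import checkpoint

# Illustrative only: a single dropout layer under activation checkpointing.
drop = torch.nn.Dropout(p=0.1)

def dropout_with_selective_ckpt(x: torch.Tensor) -> torch.Tensor:
    # The activation is recomputed during backward instead of being stored;
    # checkpoint preserves and restores the RNG state, so the replayed
    # dropout mask is identical to the one used in the forward pass.
    return checkpoint(drop, x, use_reentrant=False)

x = torch.randn(4, 8, requires_grad=True)
dropout_with_selective_ckpt(x).sum().backward()

The trade-off is a small amount of recomputation in exchange for not keeping dropout activations alive, which is presumably why the flag is enabled by default here.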
@@ -169,12 +169,7 @@ def get_shard_state_dict(shard_model):
     """

-    # TODO: rank0_only can save memory for non-rank0 gpu, but when tp is enabled, model saving will left some parameters
-    # save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
-    # with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
-    #     states = model.state_dict()
-
-    # in this version, FSDP model can only save with sharded shape LOCAL_STATE_DICT
+    # FSDP model can only save with sharded shape SHARDED_STATE_DICT when set use_orig_params=True
     with FSDP.state_dict_type(shard_model, StateDictType.SHARDED_STATE_DICT):
         shard_states = shard_model.state_dict()
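This hunk removes the commented-out FULL_STATE_DICT path and keeps only sharded saving. A minimal self-contained sketch of that pattern follows; the save_sharded helper and file layout are illustrative, not the repository's actual checkpoint code.

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

def save_sharded(shard_model: FSDP, ckpt_dir: str) -> None:
    # SHARDED_STATE_DICT returns only this rank's local shards, avoiding the
    # cross-rank all-gather that FULL_STATE_DICT would trigger, so every rank
    # writes its own file.
    with FSDP.state_dict_type(shard_model, StateDictType.SHARDED_STATE_DICT):
        shard_states = shard_model.state_dict()
    torch.save(shard_states, f"{ckpt_dir}/model_rank{dist.get_rank()}.pt")

As the new comment notes, in this version sharded saving requires the FSDP wrapper to be constructed with use_orig_params=True.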