mirror of https://github.com/InternLM/InternLM
fix(internlm/model): reset dropout_selective_checkpoint=True
parent 5bca32e4dc
commit bd809a61f2
@@ -461,7 +461,7 @@ def build_model_with_cfg(
     apply_post_layer_norm=False,  # pylint: disable=W0613
     layer_norm_epsilon=1e-5,
     is_reward=False,
-    dropout_selective_checkpoint=False,
+    dropout_selective_checkpoint=True,
     use_scaled_init: bool = True,
     use_swiglu: bool = True,
     use_flash_attn: bool = True,
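The hunk above flips the default dropout_selective_checkpoint from False to True in build_model_with_cfg. As a rough, stand-alone sketch of the idea only (this is not InternLM's internal implementation, which applies the technique inside its transformer blocks): with selective checkpointing, the dropout output is not stored for the backward pass but recomputed, and torch.utils.checkpoint replays the saved RNG state so the recomputed mask matches the forward pass.

import torch
from torch.utils.checkpoint import checkpoint

# Illustrative only: a single dropout layer under activation checkpointing.
drop = torch.nn.Dropout(p=0.1)

def dropout_with_selective_ckpt(x: torch.Tensor) -> torch.Tensor:
    # The activation is recomputed during backward instead of being stored;
    # checkpoint preserves and restores the RNG state, so the replayed
    # dropout mask is identical to the one used in the forward pass.
    return checkpoint(drop, x, use_reentrant=False)

x = torch.randn(4, 8, requires_grad=True)
dropout_with_selective_ckpt(x).sum().backward()

The trade-off is a small amount of recomputation in exchange for not keeping dropout activations alive, which is presumably why the flag is enabled by default here.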
@@ -169,12 +169,7 @@ def get_shard_state_dict(shard_model):
     """

-    # TODO: rank0_only can save memory for non-rank0 gpu, but when tp is enabled, model saving will left some parameters
-    # save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
-    # with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
-    #     states = model.state_dict()
-
-    # in this version, FSDP model can only save with sharded shape LOCAL_STATE_DICT
+    # FSDP model can only save with sharded shape SHARDED_STATE_DICT when set use_orig_params=True
     with FSDP.state_dict_type(shard_model, StateDictType.SHARDED_STATE_DICT):
         shard_states = shard_model.state_dict()
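This hunk removes the commented-out FULL_STATE_DICT path and keeps only sharded saving. A minimal self-contained sketch of that pattern follows; the save_sharded helper and file layout are illustrative, not the repository's actual checkpoint code.

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

def save_sharded(shard_model: FSDP, ckpt_dir: str) -> None:
    # SHARDED_STATE_DICT returns only this rank's local shards, avoiding the
    # cross-rank all-gather that FULL_STATE_DICT would trigger, so every rank
    # writes its own file.
    with FSDP.state_dict_type(shard_model, StateDictType.SHARDED_STATE_DICT):
        shard_states = shard_model.state_dict()
    torch.save(shard_states, f"{ckpt_dir}/model_rank{dist.get_rank()}.pt")

As the new comment notes, in this version sharded saving requires the FSDP wrapper to be constructed with use_orig_params=True.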