more wrap

pull/293/head
zaglc 2023-09-27 17:35:28 +08:00
parent c703938fb3
commit 80f1eb9a36
1 changed file with 19 additions and 1 deletion


@@ -36,6 +36,16 @@ from internlm.model.modeling_internlm import (
     PackedFlashBaseLayer1D,
     PackedFlashInternLm1D,
 )
+from internlm.model.multi_head_attention import MHA
+from flash_attn.modules.mha import (
+    CrossAttention,
+    FlashCrossAttention,
+    FlashSelfAttention,
+    SelfAttention,
+    _update_kv_cache,
+)
 from internlm.monitor import send_heartbeat, set_env_var
 from internlm.monitor.monitor import monitor_manager as mm
 from internlm.solver.beta2_scheduler import Beta2Scheduler
@@ -107,9 +117,17 @@ def initialize_model():
 def wrap_FSDP_model(model: Union[nn.Module, nn.ModuleList]):
+    from internlm.model.utils import gather_forward_split_backward, try_import_RMSNorm
+
+    RMSNorm = try_import_RMSNorm()
     if gpc.config.parallel.use_fsdp:
         transformer_wrap_policy = functools.partial(
-            transformer_auto_wrap_policy, transformer_layer_cls={PackedFlashBaseLayer1D, PackedFlashInternLm1D}
+            transformer_auto_wrap_policy,
+            transformer_layer_cls={
+                PackedFlashBaseLayer1D,
+                PackedFlashInternLm1D,
+                MHA,
+                FlashCrossAttention,
+                FlashSelfAttention,
+                RMSNorm,
+            },
         )
         grp = gpc.get_group(ParallelMode.ZERO1)
         model = FSDP(