From 887d2d579b522cadab12571f2357d9e2cbd23aed Mon Sep 17 00:00:00 2001
From: Haze188
Date: Thu, 15 Aug 2024 14:40:26 +0800
Subject: [PATCH] [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)

---
 colossalai/shardformer/modeling/deepseek.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index a84a30972..429c4350c 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -666,6 +666,9 @@ def get_deepseek_flash_attention_model_forward(shard_config, sp_mode=None, sp_si
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # TODO: upgrade transformers to 4.44.0 to fix the bug, remove the hard code.
+        self._use_flash_attention_2 = shard_config.enable_flash_attention
+        self._use_sdpa = False if shard_config.enable_flash_attention else self._use_sdpa
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
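
Note: below is a minimal, self-contained sketch of the mask-selection logic this patch forces, useful for seeing the workaround in isolation. The function name select_attention_mask and the enable_flash_attention parameter are illustrative stand-ins (mirroring shard_config.enable_flash_attention and the self._use_* flags in the diff), not ColossalAI or transformers API.

    import torch

    def select_attention_mask(attention_mask, enable_flash_attention, prev_use_sdpa=True):
        # The patch hard-codes these two flags so the model forward always
        # takes the flash-attention-2 branch when the shard config asks for it.
        use_flash_attention_2 = enable_flash_attention
        # Mirrors the patch: sdpa is only forced off when flash attention is
        # enabled; otherwise the previously configured value is kept.
        use_sdpa = False if enable_flash_attention else prev_use_sdpa

        if use_flash_attention_2:
            # Flash attention 2 expects the raw 2D padding mask (or None when
            # nothing is padded), not the expanded 4D causal mask the sdpa
            # path would build, which is the source of the mask mismatch.
            return attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        # (sdpa / eager paths would expand the mask to 4D here)
        return attention_mask

    # Usage: a mask with padding is passed through as-is; an all-ones mask
    # is dropped entirely (None), letting the kernel skip masking.
    padded = torch.tensor([[1, 1, 1, 0]])
    print(select_attention_mask(padded, enable_flash_attention=True))  # tensor([[1, 1, 1, 0]])
    full = torch.ones(1, 4, dtype=torch.long)
    print(select_attention_mask(full, enable_flash_attention=True))    # None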