mirror of https://github.com/hpcaitech/ColossalAI
import math
from typing import Optional

import torch
import torch.nn.functional as F
from einops import rearrange

from ..scaled_softmax import AttnMaskType
from .flash_attn_2 import HAS_FLASH_ATTN
from .mem_eff_attn import HAS_MEM_EFF_ATTN
from .utils import Repad, SeqLenInfo, Unpad

if HAS_FLASH_ATTN:
    from .flash_attn_2 import flash_attention
if HAS_MEM_EFF_ATTN:
    from .mem_eff_attn import mem_eff_attention
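# Which kernel is usable is decided once at import time via HAS_FLASH_ATTN and
# HAS_MEM_EFF_ATTN; ColoAttention.forward() below dispatches to flash_attention
# for fp16/bf16 inputs without a bias and falls back to mem_eff_attention.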

class ColoAttention(torch.nn.Module):

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, scale=None):
        super().__init__()
        assert embed_dim % num_heads == 0, \
            f"the embed dim ({embed_dim}) is not divisible by the number of attention heads ({num_heads})."
        if scale is not None:
            self.scale = scale
        else:
            # default to the standard 1/sqrt(head_dim) attention scaling
            self.scale = 1 / math.sqrt(embed_dim // num_heads)
        self.dropout = dropout

        if not HAS_MEM_EFF_ATTN and not HAS_FLASH_ATTN:
            raise RuntimeError("neither flash attention nor memory-efficient attention is available.")

    @staticmethod
    def unpad(tensor: torch.Tensor, indices: torch.Tensor) -> torch.Tensor:
        return Unpad.apply(tensor, indices)

    @staticmethod
    def repad(tensor: torch.Tensor, indices: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
        return Repad.apply(tensor, indices, batch_size, seq_len)
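    # unpad/repad implement the packed data path used when a padding-style
    # mask is supplied: unpad gathers only the valid (non-padding) tokens,
    # selected by `indices`, into a packed (total_tokens, ...) layout, and
    # repad scatters the attention output back to (batch_size * seq_len, ...)
    # so it can be reshaped to (batch_size, seq_len, num_heads, head_dim).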

    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                attn_mask: Optional[torch.Tensor] = None,
                attn_mask_type: Optional[AttnMaskType] = None,
                bias: Optional[torch.Tensor] = None):

        # pick the fastest available kernel: flash attention only supports
        # fp16/bf16 inputs and no attention bias
        if HAS_FLASH_ATTN and query.dtype in [torch.float16, torch.bfloat16] and bias is None:
            attn = flash_attention
        else:
            attn = mem_eff_attention

        padded = attn_mask_type is not None and attn_mask_type.value % 2 == 1
        causal = attn_mask_type is not None and attn_mask_type.value > 1

        batch_size, tgt_len, src_len = query.shape[0], query.shape[1], key.shape[1]
        # unpad
        seq_len_info_q = None
        seq_len_info_kv = None
        if padded:
            # bert style, unpad process
            assert attn_mask is not None, \
                f"attention mask {attn_mask} is not valid for attention mask type {attn_mask_type}."
            assert attn_mask.dim() == 2, \
                "attention mask is supposed to have shape (batch_size, seq_len), " + \
                f"but got {attn_mask.dim()} dimensions."

            # bert style
            if tgt_len == src_len:
                seq_len_info_q = SeqLenInfo.materialize(attn_mask=attn_mask, device=query.device)
                if batch_size > 1:
                    query, key, value = self.unpad(torch.stack([query, key, value], dim=2),
                                                   seq_len_info_q.indices).unbind(dim=1)
                else:
                    query, key, value = torch.stack([query, key, value], dim=2).squeeze(0).unbind(dim=1)
                seq_len_info_kv = seq_len_info_q
            else:
                seq_len_info_q = SeqLenInfo.materialize(size=(batch_size, tgt_len), device=query.device)
                seq_len_info_kv = SeqLenInfo.materialize(attn_mask=attn_mask, device=query.device)
                if batch_size > 1:
                    query = rearrange(query, "b s ... -> c (b s) ...", c=1)
                    # only key/value carry padding here; stack two tensors so the
                    # unbind below matches the two names on the left-hand side
                    key, value = self.unpad(torch.stack([key, value], dim=2),
                                            seq_len_info_kv.indices).unbind(dim=1)
                else:
                    query, key, value = torch.stack([query, key, value], dim=2).squeeze(0).unbind(dim=1)

        out = attn(query,
                   key,
                   value,
                   seq_len_info_q,
                   seq_len_info_kv,
                   dropout_p=self.dropout,
                   scale=self.scale,
                   causal=causal,
                   padded=padded)

        # repad
        if padded:
            if batch_size > 1:
                out = self.repad(out, seq_len_info_q.indices, batch_size, tgt_len)
            out = rearrange(out, '(b s) h d -> b s h d', b=batch_size)

        out = rearrange(out, 'b s h d -> b s (h d)')
        return out
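
# Minimal usage sketch (not part of the original module): shows how
# ColoAttention is typically called. It assumes a CUDA device, that at least
# one of the flash / memory-efficient kernels is installed, and that
# AttnMaskType defines a `causal` member with value > 1, as implied by the
# checks in forward(). Inputs use (batch, seq_len, num_heads, head_dim) layout.
if __name__ == "__main__":
    embed_dim, num_heads, head_dim = 1024, 16, 64
    attn = ColoAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=0.1)

    q = torch.randn(2, 128, num_heads, head_dim, dtype=torch.float16, device="cuda")
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # causal self-attention without padding: no attn_mask is needed
    out = attn(q, k, v, attn_mask_type=AttnMaskType.causal)
    print(out.shape)  # expected: torch.Size([2, 128, 1024])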