# ColossalAI/extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py

from ...base_extension import _Extension


class FlashAttentionSdpaCudaExtension(_Extension):
    """Attention extension backed by ``torch.nn.functional.scaled_dot_product_attention``.

    The extension registers itself under the name ``"flash_attention_sdpa_cuda"``
    with both AOT and JIT support disabled; ``load`` simply returns a Python
    wrapper around the PyTorch function, so nothing is compiled.
    """

    def __init__(self):
        super().__init__(name="flash_attention_sdpa_cuda", support_aot=False, support_jit=False)

    def is_available(self) -> bool:
        """Return ``torch.cuda.is_available()``, or ``False`` if the check fails.

        A failure to import torch, or any other ordinary error raised by the
        query, is treated as "CUDA not available" rather than propagated.
        """
        # cuda extension can only be built if cuda is available
        try:
            import torch

            return torch.cuda.is_available()
        except Exception:
            # Best-effort probe: ordinary errors mean "unavailable". Catching
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate instead of being reported as "no CUDA".
            return False

    def assert_compatible(self) -> bool:
        """Perform no compatibility check.

        NOTE(review): the body does nothing and therefore returns ``None`` even
        though the annotation says ``bool`` -- confirm what the base class and
        callers expect before changing either.
        """

    def build_aot(self) -> None:
        """Always raise ``NotImplementedError``; there is nothing to build ahead of time."""
        raise NotImplementedError("Flash attention SDPA does not require ahead-of-time compilation.")

    def build_jit(self) -> None:
        """Always raise ``NotImplementedError``; there is nothing to build just in time."""
        raise NotImplementedError("Flash attention SDPA does not require just-in-time compilation.")

    def load(self):
        """Return a ``flash_attention`` callable that delegates to PyTorch SDPA.

        ``torch`` is imported here rather than at module level, so importing
        this module does not itself require torch.
        """
        from typing import Optional

        import torch

        def flash_attention(
            q: torch.Tensor,
            k: torch.Tensor,
            v: torch.Tensor,
            dropout_p: float = 0.0,
            scale: Optional[float] = None,
            attention_mask: Optional[torch.Tensor] = None,
            is_causal: bool = False,
            cu_seqlens_q: Optional[torch.Tensor] = None,
            cu_seqlens_kv: Optional[torch.Tensor] = None,
            max_seqlen_q: Optional[int] = None,
            max_seqlen_kv: Optional[int] = None,
            q_indices: Optional[torch.Tensor] = None,
            kv_indices: Optional[torch.Tensor] = None,
        ):
            """Compute attention via ``torch.nn.functional.scaled_dot_product_attention``.

            Args:
                q: Query tensor, passed through unchanged.
                k: Key tensor, passed through unchanged.
                v: Value tensor, passed through unchanged.
                dropout_p: Forwarded as ``dropout_p``.
                scale: Forwarded as ``scale``.
                attention_mask: Forwarded as ``attn_mask``.
                is_causal: Accepted but NOT forwarded.
                cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv,
                q_indices, kv_indices: Accepted but unused.

            Returns:
                Whatever ``scaled_dot_product_attention`` returns for the
                forwarded arguments.
            """
            # NOTE(review): ``is_causal`` and the varlen/index arguments are
            # ignored here -- presumably they exist so this signature matches
            # other attention backends, and causal masking is expected to be
            # baked into ``attention_mask`` by the caller. Confirm against the
            # call sites before forwarding ``is_causal``: SDPA rejects an
            # explicit mask combined with ``is_causal=True``.
            return torch.nn.functional.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attention_mask,
                dropout_p=dropout_p,
                scale=scale,
            )

        return flash_attention
[Inference/Refactor] Refactor compilation mechanism and unified multi hw (#5613) * refactor compilation mechanism and unified multi hw * fix file path bug * add init.py to make pybind a module to avoid relative path error caused by softlink * delete duplicated micros * fix micros bug in gcc 7 months ago			`from ...base_extension import _Extension`
[shardformer] update colo attention to support custom mask (#5510) * [feature] refactor colo attention (#5462) * [extension] update api * [feature] add colo attention * [feature] update sdpa * [feature] update npu attention * [feature] update flash-attn * [test] add flash attn test * [test] update flash attn test * [shardformer] update modeling to fit colo attention (#5465) * [misc] refactor folder structure * [shardformer] update llama flash-attn * [shardformer] fix llama policy * [devops] update tensornvme install * [test] update llama test * [shardformer] update colo attn kernel dispatch * [shardformer] update blip2 * [shardformer] update chatglm * [shardformer] update gpt2 * [shardformer] update gptj * [shardformer] update opt * [shardformer] update vit * [shardformer] update colo attention mask prep * [shardformer] update whisper * [test] fix shardformer tests (#5514) * [test] fix shardformer tests * [test] fix shardformer tests 8 months ago

			`class FlashAttentionSdpaCudaExtension(_Extension):`
			`def __init__(self):`
			`super().__init__(name="flash_attention_sdpa_cuda", support_aot=False, support_jit=False)`

			`def is_available(self) -> bool:`
			`# cuda extension can only be built if cuda is available`
			`try:`
			`import torch`

			`cuda_available = torch.cuda.is_available()`
			`except:`
			`cuda_available = False`
			`return cuda_available`

			`def assert_compatible(self) -> bool:`
			`pass`

			`def build_aot(self) -> None:`
			`raise NotImplementedError("Flash attention SDPA does not require ahead-of-time compilation.")`

			`def build_jit(self) -> None:`
			`raise NotImplementedError("Flash attention SDPA does not require just-in-time compilation.")`

			`def load(self):`
			`from typing import Optional`

			`import torch`

			`def flash_attention(`
			`q: torch.Tensor,`
			`k: torch.Tensor,`
			`v: torch.Tensor,`
			`dropout_p: float = 0.0,`
			`scale: Optional[float] = None,`
			`attention_mask: Optional[torch.Tensor] = None,`
			`is_causal: bool = False,`
			`cu_seqlens_q: Optional[torch.Tensor] = None,`
			`cu_seqlens_kv: Optional[torch.Tensor] = None,`
			`max_seqlen_q: Optional[int] = None,`
			`max_seqlen_kv: Optional[int] = None,`
			`q_indices: Optional[torch.Tensor] = None,`
			`kv_indices: Optional[torch.Tensor] = None,`
			`):`
			`return torch.nn.functional.scaled_dot_product_attention(`
			`q,`
			`k,`
			`v,`
			`attn_mask=attention_mask,`
			`dropout_p=dropout_p,`
			`scale=scale,`
			`)`

			`return flash_attention`