# Mirror of https://github.com/hpcaitech/ColossalAI
# (as mirrored: Python source, 170 lines, 7.0 KiB)
import math
|
|
|
|
import pytest
|
|
import torch
|
|
from einops import rearrange
|
|
|
|
from colossalai.kernel.cuda_native.mha.flash_attn_2 import HAS_FLASH_ATTN
|
|
from colossalai.kernel.cuda_native.mha.mem_eff_attn import HAS_MEM_EFF_ATTN
|
|
from colossalai.testing import clear_cache_before_run, parameterize
|
|
|
|
if HAS_MEM_EFF_ATTN or HAS_FLASH_ATTN:
|
|
from colossalai.kernel.cuda_native import ColoAttention
|
|
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
|
|
|
|
# Half-precision dtypes. Presumably the set the flash-attention path accepts
# (the name suggests so; no test visible here uses it) -- TODO confirm.
FLASH_DTYPE = [torch.float16, torch.bfloat16]

# Every dtype the parameterized tests sweep: the half-precision ones plus float32.
DTYPE = [*FLASH_DTYPE, torch.float32]
|
|
|
|
|
|
def attention_ref(q, k, v, attn_mask=None, causal=False):
    """Compute attention with plain PyTorch ops as the control group for kernel tests.

    The math is carried out in float32 whatever the input dtype is, and the
    result is cast back to the dtype of ``q``. This keeps the reference from
    accumulating its own half-precision error.

    Args:
        q: Query tensor indexed as (batch, seqlen_q, heads, head_dim).
        k: Key tensor indexed as (batch, seqlen_k, heads, head_dim).
        v: Value tensor with the same layout as ``k``.
        attn_mask: Optional boolean tensor of shape (batch, seqlen) where True
            marks a valid token. It masks key positions in the scores and
            zeroes the matching output rows, so it is only meaningful when
            seqlen_q == seqlen_k.
        causal: When True, query position ``t`` may only attend to key
            positions ``<= t`` (mask aligned to the top-left corner).

    Returns:
        Tensor of shape (batch, seqlen_q, heads * head_dim) in the dtype of ``q``.

    Note:
        A sequence whose ``attn_mask`` row is entirely False yields NaN
        attention weights (softmax over all ``-inf``); callers should keep at
        least one valid token per sequence.
    """
    dtype_og = q.dtype
    # Upcast so the reference is computed in full precision; autograd still
    # routes gradients back to the original low-precision leaves.
    q, k, v = q.float(), k.float(), v.float()

    seqlen_q, seqlen_k = q.shape[1], k.shape[1]
    scale = 1.0 / math.sqrt(q.shape[-1])
    scores = torch.einsum('bthd,bshd->bhts', q * scale, k)

    padded = None
    if attn_mask is not None:
        padded = ~attn_mask
        # (b, s) -> (b, 1, 1, s): hide padded keys from every head and query.
        scores = scores.masked_fill(padded[:, None, None, :], float('-inf'))
    if causal:
        future = torch.triu(
            torch.ones(seqlen_q, seqlen_k, dtype=torch.bool, device=q.device), diagonal=1
        )
        scores = scores.masked_fill(future, float('-inf'))
    attention = torch.softmax(scores, dim=-1)

    output = torch.einsum('bhts,bshd->bthd', attention, v)
    # (b, t, h, d) -> (b, t, h * d)
    output = output.flatten(start_dim=2)

    # Modify the data at the positions of the mask to 0
    if padded is not None:
        output = output.masked_fill(padded[:, :, None], 0.0)

    return output.to(dtype=dtype_og)
|
|
|
|
|
|
@pytest.mark.skipif(not HAS_MEM_EFF_ATTN and not HAS_FLASH_ATTN, reason="xformers is not available")
@clear_cache_before_run()
@parameterize('proj_shape', [(6, 8, 4, 16)])
@parameterize('dtype', DTYPE)
@parameterize('dropout', [0.0])
def test_attention_gpt(proj_shape, dtype, dropout):
    """Compare ColoAttention with a padded-causal mask against ``attention_ref``.

    Checks the output shape, the forward result and the gradients with respect
    to q, k and v.
    """
    (B, S, H, D_HEAD) = proj_shape
    D = H * D_HEAD

    q = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    k = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    v = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)

    # Sequence i keeps its first S - i tokens; pad_sequence fills the rest with False.
    mask = [torch.ones(S - i, dtype=torch.bool, device="cuda") for i in range(B)]
    mask = torch.nn.utils.rnn.pad_sequence(mask, batch_first=True)

    attn = ColoAttention(D, H, dropout=dropout)
    y = attn(q, k, v, attn_mask=mask, attn_mask_type=AttnMaskType.paddedcausal)

    assert list(y.shape) == [B, S, D]

    out_ref = attention_ref(q, k, v, mask, causal=True)

    # check gradients
    dy = torch.rand_like(y)
    grad_q, grad_k, grad_v = torch.autograd.grad(y, (q, k, v), dy)
    grad_ref_q, grad_ref_k, grad_ref_v = torch.autograd.grad(out_ref, (q, k, v), dy)

    # The comparisons used to be bare expressions and never failed; they are
    # real assertions now. A fixed atol of 1e-7 is below what fp16/bf16 can
    # represent, so the tolerance follows the dtype.
    # NOTE(review): tolerance values are a starting point -- confirm on target hardware.
    tol = {torch.float32: 1e-3, torch.float16: 1e-2, torch.bfloat16: 5e-2}[dtype]

    assert torch.allclose(y, out_ref, atol=tol, rtol=tol), f"{(y - out_ref).abs().max()}"
    assert torch.allclose(grad_q, grad_ref_q, atol=tol, rtol=tol), f"{(grad_q - grad_ref_q).abs().max()}"
    assert torch.allclose(grad_k, grad_ref_k, atol=tol, rtol=tol), f"{(grad_k - grad_ref_k).abs().max()}"
    assert torch.allclose(grad_v, grad_ref_v, atol=tol, rtol=tol), f"{(grad_v - grad_ref_v).abs().max()}"
|
|
|
|
|
|
@pytest.mark.skipif(not HAS_MEM_EFF_ATTN and not HAS_FLASH_ATTN, reason="xformers is not available")
@clear_cache_before_run()
@parameterize('proj_shape', [(6, 8, 4, 16)])
@parameterize('dtype', DTYPE)
@parameterize('dropout', [0.0])
def test_attention_bert(proj_shape, dtype, dropout):
    """Compare ColoAttention with a padding mask against ``attention_ref``.

    Checks the output shape, the forward result and the gradients with respect
    to q, k and v.
    """
    (B, S, H, D_HEAD) = proj_shape
    D = H * D_HEAD

    q = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    k = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    v = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)

    # attention mask of shape [B, S] with zero padding to max length S
    mask = torch.randint(0, 2, (B, S), dtype=torch.bool, device="cuda")
    # A sequence with no valid token makes the reference softmax run over all
    # -inf and return NaN, so keep at least one token per sequence valid.
    mask[:, 0] = True

    attn = ColoAttention(D, H, dropout=dropout)
    y = attn(q, k, v, attn_mask=mask, attn_mask_type=AttnMaskType.padding)

    assert list(y.shape) == [B, S, D]

    out_ref = attention_ref(q, k, v, mask, causal=False)

    # check gradients
    dy = torch.rand_like(y)
    grad_q, grad_k, grad_v = torch.autograd.grad(y, (q, k, v), dy)
    grad_ref_q, grad_ref_k, grad_ref_v = torch.autograd.grad(out_ref, (q, k, v), dy)

    # The comparisons used to be bare expressions and never failed; they are
    # real assertions now. A fixed atol of 1e-7 is below what fp16/bf16 can
    # represent, so the tolerance follows the dtype.
    # NOTE(review): tolerance values are a starting point -- confirm on target hardware.
    tol = {torch.float32: 1e-3, torch.float16: 1e-2, torch.bfloat16: 5e-2}[dtype]

    assert torch.allclose(y, out_ref, atol=tol, rtol=tol), f"{(y - out_ref).abs().max()}"
    assert torch.allclose(grad_q, grad_ref_q, atol=tol, rtol=tol), f"{(grad_q - grad_ref_q).abs().max()}"
    assert torch.allclose(grad_k, grad_ref_k, atol=tol, rtol=tol), f"{(grad_k - grad_ref_k).abs().max()}"
    assert torch.allclose(grad_v, grad_ref_v, atol=tol, rtol=tol), f"{(grad_v - grad_ref_v).abs().max()}"
|
|
|
|
|
|
@pytest.mark.skipif(not HAS_MEM_EFF_ATTN and not HAS_FLASH_ATTN, reason="xformers is not available")
@clear_cache_before_run()
@parameterize('proj_shape', [(6, 8, 4, 16)])
@parameterize('dtype', DTYPE)
@parameterize('dropout', [0.0])
def test_attention_no_mask(proj_shape, dtype, dropout):
    """Compare ColoAttention without any mask against ``attention_ref``.

    Checks the output shape, the forward result and the gradients with respect
    to q, k and v.
    """
    (B, S, H, D_HEAD) = proj_shape
    D = H * D_HEAD

    q = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    k = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    v = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)

    attn = ColoAttention(D, H, dropout=dropout)
    y = attn(q, k, v)

    assert list(y.shape) == [B, S, D]

    out_ref = attention_ref(q, k, v, None, causal=False)

    # check gradients
    dy = torch.rand_like(y)
    grad_q, grad_k, grad_v = torch.autograd.grad(y, (q, k, v), dy)
    grad_ref_q, grad_ref_k, grad_ref_v = torch.autograd.grad(out_ref, (q, k, v), dy)

    # The comparisons used to be bare expressions and never failed; they are
    # real assertions now. A fixed atol of 1e-7 is below what fp16/bf16 can
    # represent, so the tolerance follows the dtype.
    # NOTE(review): tolerance values are a starting point -- confirm on target hardware.
    tol = {torch.float32: 1e-3, torch.float16: 1e-2, torch.bfloat16: 5e-2}[dtype]

    assert torch.allclose(y, out_ref, atol=tol, rtol=tol), f"{(y - out_ref).abs().max()}"
    assert torch.allclose(grad_q, grad_ref_q, atol=tol, rtol=tol), f"{(grad_q - grad_ref_q).abs().max()}"
    assert torch.allclose(grad_k, grad_ref_k, atol=tol, rtol=tol), f"{(grad_k - grad_ref_k).abs().max()}"
    assert torch.allclose(grad_v, grad_ref_v, atol=tol, rtol=tol), f"{(grad_v - grad_ref_v).abs().max()}"
|
|
|
|
|
|
@pytest.mark.skipif(not HAS_MEM_EFF_ATTN and not HAS_FLASH_ATTN, reason="xformers is not available")
@clear_cache_before_run()
@parameterize('proj_shape', [(6, 24, 8, 4, 16)])
@parameterize('dtype', DTYPE)
@parameterize('dropout', [0.0])
def test_cross_attention(proj_shape, dtype, dropout):
    """Compare causal cross-attention from ColoAttention against ``attention_ref``.

    Queries have length T while keys and values have length S. Checks the
    output shape, the forward result and the gradients with respect to q, k
    and v.
    """
    (B, S, T, H, D_HEAD) = proj_shape
    D = H * D_HEAD

    q = torch.randn((B, T, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    k = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
    v = torch.randn((B, S, H, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)

    attn = ColoAttention(D, H, dropout=dropout)
    y = attn(q, k, v, attn_mask_type=AttnMaskType.causal)

    assert list(y.shape) == [B, T, D]

    # NOTE(review): the reference aligns its causal mask to the top-left corner
    # of the (T, S) score matrix. A kernel that aligns it bottom-right when
    # T != S would disagree -- confirm which convention ColoAttention follows.
    out_ref = attention_ref(q, k, v, None, causal=True)

    # check gradients
    dy = torch.rand_like(y)
    grad_q, grad_k, grad_v = torch.autograd.grad(y, (q, k, v), dy)
    grad_ref_q, grad_ref_k, grad_ref_v = torch.autograd.grad(out_ref, (q, k, v), dy)

    # The comparisons used to be bare expressions and never failed; they are
    # real assertions now. The old atol values (1e-18 / 1e-7) are below what
    # any of the tested dtypes can represent, so the tolerance follows the dtype.
    # NOTE(review): tolerance values are a starting point -- confirm on target hardware.
    tol = {torch.float32: 1e-3, torch.float16: 1e-2, torch.bfloat16: 5e-2}[dtype]

    assert torch.allclose(y, out_ref, atol=tol, rtol=tol), f"{(y - out_ref).abs().max()}"
    assert torch.allclose(grad_q, grad_ref_q, atol=tol, rtol=tol), f"{(grad_q - grad_ref_q).abs().max()}"
    assert torch.allclose(grad_k, grad_ref_k, atol=tol, rtol=tol), f"{(grad_k - grad_ref_k).abs().max()}"
    assert torch.allclose(grad_v, grad_ref_v, atol=tol, rtol=tol), f"{(grad_v - grad_ref_v).abs().max()}"