@@ -8,6 +8,8 @@ import torch.nn.functional as F
 from packaging.version import Version
 from torch.distributed import ReduceOp
 
+from .fp8_config import dynamic_kernel
+
 SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.4.0")
 SCALE_BYTES = 4
 try:
@@ -832,11 +834,13 @@ class _LinearFp8(torch.autograd.Function):
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-@torch.compile(mode="max-autotune-no-cudagraphs", disable=not SUPPORT_TORCH_COMPILE, dynamic=False)
+@torch.compile(mode="max-autotune-no-cudagraphs", disable=not SUPPORT_TORCH_COMPILE, dynamic=dynamic_kernel)
 def _linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     return _LinearFp8.apply(input, weight, bias)
 
 
 def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    if input.shape[-1] % 16 != 0 or np.prod(input.shape[:-1]) % 16 != 0:
+        return F.linear(input, weight, bias)
     out = _linear_fp8(input, weight, bias)
     return out
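
The behavioural change in this hunk is the `dynamic=` argument of `torch.compile`: instead of always specializing the compiled kernel to the shapes seen at trace time (`dynamic=False`), the patch defers the choice to the `dynamic_kernel` flag imported from `.fp8_config`. Below is a minimal sketch of what that flag controls, using a toy function rather than the patched `_linear_fp8`; the names `toy_matmul`, `static_fn`, and `dynamic_fn` are illustrative and not part of the repository.

import torch

# With dynamic=False the compiled artifact is specialized to the input shapes
# seen on the first call and may recompile when a new shape arrives; with
# dynamic=True the graph is traced with symbolic shapes so one compiled kernel
# can serve many shapes. The patch routes this choice through the
# dynamic_kernel config flag instead of hard-coding False.

def toy_matmul(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    return x @ w.t()

static_fn = torch.compile(toy_matmul, dynamic=False)   # shape-specialized
dynamic_fn = torch.compile(toy_matmul, dynamic=True)   # symbolic shapes

if __name__ == "__main__":
    w = torch.randn(32, 64)
    for batch in (16, 48):                              # two different batch sizes
        x = torch.randn(batch, 64)
        torch.testing.assert_close(static_fn(x, w), dynamic_fn(x, w))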
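The new guard in `linear_fp8` routes misaligned inputs through the ordinary `F.linear` path: it checks that the last (feature) dimension and the product of all leading dimensions are both multiples of 16, presumably because the FP8 GEMM used underneath (e.g. `torch._scaled_mm`) only accepts 16-aligned matrices. A small sketch of that condition in isolation follows; `needs_fallback` and the example shapes are illustrative, not part of the patch.

import numpy as np

# Same condition as the guard added to linear_fp8: fall back when either the
# flattened "row" count or the feature dimension is not a multiple of 16.
def needs_fallback(shape: tuple) -> bool:
    return shape[-1] % 16 != 0 or np.prod(shape[:-1]) % 16 != 0

print(needs_fallback((8, 16, 4096)))  # False: 8*16 = 128 rows and 4096 features, both multiples of 16
print(needs_fallback((3, 7, 4096)))   # True: 3*7 = 21 rows is not a multiple of 16
print(needs_fallback((128, 1000)))    # True: 1000 features is not a multiple of 16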