diff --git a/colossalai/kernel/cuda_native/mha/__init__.py b/colossalai/kernel/cuda_native/mha/__init__.py
new file mode 100644
index 000000000..21fddd512
--- /dev/null
+++ b/colossalai/kernel/cuda_native/mha/__init__.py
@@ -0,0 +1,3 @@
+from .mha import ColoAttention
+
+__all__ = ['ColoAttention']
diff --git a/tests/test_utils/test_flash_attention.py b/tests/test_utils/test_flash_attention.py
index d41ccd832..fbcc45265 100644
--- a/tests/test_utils/test_flash_attention.py
+++ b/tests/test_utils/test_flash_attention.py
@@ -9,7 +9,7 @@ from colossalai.kernel.cuda_native.mha.mem_eff_attn import HAS_MEM_EFF_ATTN
 from colossalai.testing import clear_cache_before_run, parameterize
 
 if HAS_MEM_EFF_ATTN or HAS_FLASH_ATTN:
-    from colossalai.kernel.cuda_native.mha.mha import ColoAttention
+    from colossalai.kernel.cuda_native import ColoAttention
     from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
 
 DTYPE = [torch.float16, torch.bfloat16, torch.float32]
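
Note: the test now imports ColoAttention from the package root colossalai.kernel.cuda_native instead of the colossalai.kernel.cuda_native.mha.mha submodule. For that import to resolve, colossalai/kernel/cuda_native/__init__.py must also re-export the symbol; a minimal sketch of that re-export (assumed here, it is not part of the hunks shown above) would be:

    # colossalai/kernel/cuda_native/__init__.py
    # Assumed re-export so callers can write:
    #   from colossalai.kernel.cuda_native import ColoAttention
    from .mha import ColoAttention

    __all__ = ['ColoAttention']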