From 279300dc5f34db219c90a297c0996d00221eae96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Wed, 24 Apr 2024 14:17:54 +0800 Subject: [PATCH] [Inference/Refactor] Refactor compilation mechanism and unified multi hw (#5613) * refactor compilation mechanism and unified multi hw * fix file path bug * add init.py to make pybind a module to avoid relative path error caused by softlink * delete duplicated micros * fix micros bug in gcc --- .../openmoe/model/modeling_openmoe.py | 2 +- extensions/__init__.py | 18 +++-- extensions/cpp_extension.py | 4 ++ extensions/csrc/common/data_type.h | 60 ++++++++++++++++ extensions/csrc/common/micros.h | 10 +++ .../{cuda/utils => common}/vec_type_traits.h | 69 ++++++------------- .../csrc/{cuda => }/funcs/binary_functor.h | 40 +++++------ .../csrc/{cuda => }/funcs/cast_functor.h | 49 ++++++------- .../csrc/{cuda => }/funcs/reduce_function.h | 7 +- .../csrc/{cuda => }/funcs/ternary_functor.h | 55 ++++++++------- .../csrc/{cuda => }/funcs/unary_functor.h | 19 +++-- .../csrc/{ => kernel}/arm/cpu_adam_arm.cpp | 0 .../csrc/{ => kernel}/arm/cpu_adam_arm.h | 0 .../{ => kernel}/cuda/activation_kernel.cu | 10 +-- .../cuda/attention/attention_utils.h | 26 +++---- .../cuda/context_kv_cache_memcpy_kernel.cu | 2 +- .../cuda/decode_kv_cache_memcpy_kernel.cu | 2 +- .../cuda/flash_decoding_attention_kernel.cu | 18 ++--- .../cuda/fused_rotary_emb_and_cache_kernel.cu | 4 +- .../cuda/get_cos_and_sin_kernel.cu | 2 +- .../{ => kernel}/cuda/layer_norm_kernel.cu | 2 +- .../csrc/{ => kernel}/cuda/moe_kernel.cu | 15 ++-- .../cuda/multi_tensor_adam_kernel.cu | 2 +- .../{ => kernel}/cuda/multi_tensor_apply.cuh | 2 +- .../cuda/multi_tensor_l2norm_kernel.cu | 3 +- .../cuda/multi_tensor_lamb_kernel.cu | 2 +- .../cuda/multi_tensor_scale_kernel.cu | 2 +- .../cuda/multi_tensor_sgd_kernel.cu | 2 +- .../{ => kernel}/cuda/rms_layernorm_kernel.cu | 18 ++--- .../cuda/scaled_masked_softmax_kernel.cu | 10 +-- ...aled_upper_triang_masked_softmax_kernel.cu | 10 +-- .../cuda/utils/gpu_launch_config.h | 0 .../csrc/{ => kernel}/cuda/utils/micros.h | 0 .../{ => kernel}/cuda/utils/nvgpu_dev_info.h | 0 .../csrc/{ => kernel}/cuda/utils/vec_copy.h | 11 ++- extensions/csrc/{ => kernel}/x86/cpu_adam.cpp | 0 extensions/csrc/{ => kernel}/x86/cpu_adam.h | 0 extensions/cuda_extension.py | 7 ++ extensions/inference/inference_ops_cuda.py | 36 ---------- extensions/pybind/__init__.py | 0 extensions/{ => pybind}/cpu_adam/__init__.py | 0 .../{ => pybind}/cpu_adam/cpu_adam_arm.py | 9 +-- .../{ => pybind}/cpu_adam/cpu_adam_x86.py | 11 ++- .../{ => pybind}/flash_attention/__init__.py | 0 .../flash_attention_dao_cuda.py | 2 +- .../flash_attention/flash_attention_npu.py | 2 +- .../flash_attention_sdpa_cuda.py | 2 +- extensions/{ => pybind}/inference/__init__.py | 0 .../pybind => pybind/inference}/inference.cpp | 0 .../pybind/inference/inference_ops_cuda.py | 31 +++++++++ extensions/{ => pybind}/layernorm/__init__.py | 0 .../layernorm}/layer_norm.cpp | 2 +- .../{ => pybind}/layernorm/layernorm_cuda.py | 12 ++-- extensions/{ => pybind}/moe/__init__.py | 0 .../{csrc/cuda/pybind => pybind/moe}/moe.cpp | 0 extensions/{ => pybind}/moe/moe_cuda.py | 14 ++-- extensions/{ => pybind}/optimizer/__init__.py | 0 .../optimizer/fused_optimizer_cuda.py | 23 +++---- .../pybind => pybind/optimizer}/optimizer.cpp | 0 extensions/{ => pybind}/softmax/__init__.py | 0 .../softmax}/scaled_masked_softmax.cpp | 0 .../softmax/scaled_masked_softmax_cuda.py | 14 ++-- .../scaled_upper_triang_masked_softmax.cpp | 0 
...aled_upper_triangle_masked_softmax_cuda.py | 14 ++-- 64 files changed, 345 insertions(+), 310 deletions(-) create mode 100644 extensions/csrc/common/data_type.h rename extensions/csrc/{cuda/utils => common}/vec_type_traits.h (66%) rename extensions/csrc/{cuda => }/funcs/binary_functor.h (92%) rename extensions/csrc/{cuda => }/funcs/cast_functor.h (87%) rename extensions/csrc/{cuda => }/funcs/reduce_function.h (97%) rename extensions/csrc/{cuda => }/funcs/ternary_functor.h (86%) rename extensions/csrc/{cuda => }/funcs/unary_functor.h (85%) rename extensions/csrc/{ => kernel}/arm/cpu_adam_arm.cpp (100%) rename extensions/csrc/{ => kernel}/arm/cpu_adam_arm.h (100%) rename extensions/csrc/{ => kernel}/cuda/activation_kernel.cu (92%) rename extensions/csrc/{ => kernel}/cuda/attention/attention_utils.h (88%) rename extensions/csrc/{ => kernel}/cuda/context_kv_cache_memcpy_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/decode_kv_cache_memcpy_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/flash_decoding_attention_kernel.cu (97%) rename extensions/csrc/{ => kernel}/cuda/fused_rotary_emb_and_cache_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/get_cos_and_sin_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/layer_norm_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/moe_kernel.cu (98%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_adam_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_apply.cuh (99%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_l2norm_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_lamb_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_scale_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/multi_tensor_sgd_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/rms_layernorm_kernel.cu (97%) rename extensions/csrc/{ => kernel}/cuda/scaled_masked_softmax_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/scaled_upper_triang_masked_softmax_kernel.cu (99%) rename extensions/csrc/{ => kernel}/cuda/utils/gpu_launch_config.h (100%) rename extensions/csrc/{ => kernel}/cuda/utils/micros.h (100%) rename extensions/csrc/{ => kernel}/cuda/utils/nvgpu_dev_info.h (100%) rename extensions/csrc/{ => kernel}/cuda/utils/vec_copy.h (82%) rename extensions/csrc/{ => kernel}/x86/cpu_adam.cpp (100%) rename extensions/csrc/{ => kernel}/x86/cpu_adam.h (100%) delete mode 100644 extensions/inference/inference_ops_cuda.py create mode 100644 extensions/pybind/__init__.py rename extensions/{ => pybind}/cpu_adam/__init__.py (100%) rename extensions/{ => pybind}/cpu_adam/cpu_adam_arm.py (80%) rename extensions/{ => pybind}/cpu_adam/cpu_adam_x86.py (83%) rename extensions/{ => pybind}/flash_attention/__init__.py (100%) rename extensions/{ => pybind}/flash_attention/flash_attention_dao_cuda.py (98%) rename extensions/{ => pybind}/flash_attention/flash_attention_npu.py (97%) rename extensions/{ => pybind}/flash_attention/flash_attention_sdpa_cuda.py (97%) rename extensions/{ => pybind}/inference/__init__.py (100%) rename extensions/{csrc/cuda/pybind => pybind/inference}/inference.cpp (100%) create mode 100644 extensions/pybind/inference/inference_ops_cuda.py rename extensions/{ => pybind}/layernorm/__init__.py (100%) rename extensions/{csrc/cuda/pybind => pybind/layernorm}/layer_norm.cpp (99%) rename extensions/{ => pybind}/layernorm/layernorm_cuda.py (57%) rename extensions/{ => pybind}/moe/__init__.py (100%) rename extensions/{csrc/cuda/pybind => pybind/moe}/moe.cpp (100%) 
rename extensions/{ => pybind}/moe/moe_cuda.py (58%) rename extensions/{ => pybind}/optimizer/__init__.py (100%) rename extensions/{ => pybind}/optimizer/fused_optimizer_cuda.py (50%) rename extensions/{csrc/cuda/pybind => pybind/optimizer}/optimizer.cpp (100%) rename extensions/{ => pybind}/softmax/__init__.py (100%) rename extensions/{csrc/cuda/pybind => pybind/softmax}/scaled_masked_softmax.cpp (100%) rename extensions/{ => pybind}/softmax/scaled_masked_softmax_cuda.py (66%) rename extensions/{csrc/cuda/pybind => pybind/softmax}/scaled_upper_triang_masked_softmax.cpp (100%) rename extensions/{ => pybind}/softmax/scaled_upper_triangle_masked_softmax_cuda.py (65%) diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py index fdd8442f5..709e82baa 100644 --- a/examples/language/openmoe/model/modeling_openmoe.py +++ b/examples/language/openmoe/model/modeling_openmoe.py @@ -35,7 +35,7 @@ from transformers.utils import ( replace_return_docstrings, ) -from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN +from colossalai.kernel.extensions.pybind.flash_attention import HAS_FLASH_ATTN from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON from colossalai.moe.layers import SparseMLP from colossalai.moe.manager import MOE_MANAGER diff --git a/extensions/__init__.py b/extensions/__init__.py index 1e936eec6..c392a16b5 100644 --- a/extensions/__init__.py +++ b/extensions/__init__.py @@ -1,10 +1,14 @@ -from .cpu_adam import CpuAdamArmExtension, CpuAdamX86Extension -from .flash_attention import FlashAttentionDaoCudaExtension, FlashAttentionNpuExtension, FlashAttentionSdpaCudaExtension -from .inference import InferenceOpsCudaExtension -from .layernorm import LayerNormCudaExtension -from .moe import MoeCudaExtension -from .optimizer import FusedOptimizerCudaExtension -from .softmax import ScaledMaskedSoftmaxCudaExtension, ScaledUpperTriangleMaskedSoftmaxCudaExtension +from .pybind.cpu_adam import CpuAdamArmExtension, CpuAdamX86Extension +from .pybind.flash_attention import ( + FlashAttentionDaoCudaExtension, + FlashAttentionNpuExtension, + FlashAttentionSdpaCudaExtension, +) +from .pybind.inference import InferenceOpsCudaExtension +from .pybind.layernorm import LayerNormCudaExtension +from .pybind.moe import MoeCudaExtension +from .pybind.optimizer import FusedOptimizerCudaExtension +from .pybind.softmax import ScaledMaskedSoftmaxCudaExtension, ScaledUpperTriangleMaskedSoftmaxCudaExtension ALL_EXTENSIONS = [ CpuAdamArmExtension, diff --git a/extensions/cpp_extension.py b/extensions/cpp_extension.py index 3adb65fb8..aaa43f964 100644 --- a/extensions/cpp_extension.py +++ b/extensions/cpp_extension.py @@ -25,6 +25,9 @@ class _CppExtension(_Extension): def csrc_abs_path(self, path): return os.path.join(self.relative_to_abs_path("csrc"), path) + def pybind_abs_path(self, path): + return os.path.join(self.relative_to_abs_path("pybind"), path) + def relative_to_abs_path(self, code_path: str) -> str: """ This function takes in a path relative to the colossalai root directory and return the absolute path. @@ -116,6 +119,7 @@ class _CppExtension(_Extension): """ This function should return a list of include files for extensions. 
""" + return [self.csrc_abs_path("")] @abstractmethod def cxx_flags(self) -> List[str]: diff --git a/extensions/csrc/common/data_type.h b/extensions/csrc/common/data_type.h new file mode 100644 index 000000000..1327c51d3 --- /dev/null +++ b/extensions/csrc/common/data_type.h @@ -0,0 +1,60 @@ +#pragma once + +#if defined(COLOSSAL_WITH_CUDA) +#include +#include +#endif + +namespace colossalAI { +namespace dtype { + +struct bfloat164 { +#ifdef COLOSSAL_WITH_CUDA + __nv_bfloat162 x; + __nv_bfloat162 y; +#endif +}; + +struct bfloat168 { +#ifdef COLOSSAL_WITH_CUDA + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +#endif +}; + +struct half4 { +#ifdef COLOSSAL_WITH_CUDA + half2 x; + half2 y; +#endif +}; + +struct half8 { +#ifdef COLOSSAL_WITH_CUDA + half2 x; + half2 y; + half2 z; + half2 w; +#endif +}; + +struct float4_ { +#ifdef COLOSSAL_WITH_CUDA + float2 x; + float2 y; +#endif +}; + +struct float8_ { +#ifdef COLOSSAL_WITH_CUDA + float2 x; + float2 y; + float2 z; + float2 w; +#endif +}; + +} // namespace dtype +} // namespace colossalAI diff --git a/extensions/csrc/common/micros.h b/extensions/csrc/common/micros.h index fd489d764..cf7d0ce35 100644 --- a/extensions/csrc/common/micros.h +++ b/extensions/csrc/common/micros.h @@ -222,3 +222,13 @@ AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \ "'"); \ } + +#if defined(COLOSSAL_WITH_CUDA) +#define HOST __host__ +#define DEVICE __device__ +#define HOSTDEVICE __host__ __device__ +#else +#define HOST +#define DEVICE +#define HOSTDEVICE +#endif diff --git a/extensions/csrc/cuda/utils/vec_type_traits.h b/extensions/csrc/common/vec_type_traits.h similarity index 66% rename from extensions/csrc/cuda/utils/vec_type_traits.h rename to extensions/csrc/common/vec_type_traits.h index 3a78a93c8..6ea6d7a38 100644 --- a/extensions/csrc/cuda/utils/vec_type_traits.h +++ b/extensions/csrc/common/vec_type_traits.h @@ -1,48 +1,16 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include +#endif + #include -#include -#include +#include "common/data_type.h" namespace colossalAI { -namespace cuda { -namespace utils { - -struct bfloat164 { - __nv_bfloat162 x; - __nv_bfloat162 y; -}; -struct bfloat168 { - __nv_bfloat162 x; - __nv_bfloat162 y; - __nv_bfloat162 z; - __nv_bfloat162 w; -}; - -struct half4 { - half2 x; - half2 y; -}; -struct half8 { - half2 x; - half2 y; - half2 z; - half2 w; -}; - -struct float4_ { - float2 x; - float2 y; -}; -struct float8_ { - float2 x; - float2 y; - float2 z; - float2 w; -}; +namespace common { template struct VecTypeTrait {}; @@ -57,6 +25,8 @@ struct FloatVecTypeTrait {}; }; VEC_TYPE_TRAITS_SPECIALIZATION(T, 1, T, typename T) + +#if defined(COLOSSAL_WITH_CUDA) VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 1, __nv_bfloat16) VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 2, __nv_bfloat162) VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 4, float2) @@ -67,16 +37,17 @@ VEC_TYPE_TRAITS_SPECIALIZATION(at::Half, 4, float2) VEC_TYPE_TRAITS_SPECIALIZATION(at::Half, 8, float4) VEC_TYPE_TRAITS_SPECIALIZATION(float, 2, float2) VEC_TYPE_TRAITS_SPECIALIZATION(float, 4, float4) -VEC_TYPE_TRAITS_SPECIALIZATION(float, 8, float8_) +VEC_TYPE_TRAITS_SPECIALIZATION(float, 8, dtype::float8_) VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 2, half) VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 4, half2) VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 8, float2) VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 2, __nv_bfloat162); -VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 4, bfloat164); 
-VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 8, bfloat168); +VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 4, dtype::bfloat164); +VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 8, dtype::bfloat168); VEC_TYPE_TRAITS_SPECIALIZATION(half, 2, half2); -VEC_TYPE_TRAITS_SPECIALIZATION(half, 4, half4); -VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, half8); +VEC_TYPE_TRAITS_SPECIALIZATION(half, 4, dtype::half4); +VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, dtype::half8); +#endif /* defined(COLOSSAL_WITH_CUDA) */ #undef VEC_TYPE_TRAITS_SPECIALIZATION @@ -86,17 +57,17 @@ VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, half8); using Type = FLOATT; \ }; +#if defined(COLOSSAL_WITH_CUDA) FLOATVEC_TYPE_TRAITS_SPECIALIZATION(float2, float2) FLOATVEC_TYPE_TRAITS_SPECIALIZATION(float4, float4) FLOATVEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat162, float2); -FLOATVEC_TYPE_TRAITS_SPECIALIZATION(bfloat164, float4_); -FLOATVEC_TYPE_TRAITS_SPECIALIZATION(bfloat168, float8_); +FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::bfloat164, dtype::float4_); +FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::bfloat168, dtype::float8_); FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half2, float2); -FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half4, float4_); -FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half8, float8_); +FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::half4, dtype::float4_); +FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::half8, dtype::float8_); +#endif /* COLOSSAL_WITH_CUDA */ #undef FLOATVEC_TYPE_TRAITS_SPECIALIZATION - -} // namespace utils -} // namespace cuda +} // namespace common } // namespace colossalAI diff --git a/extensions/csrc/cuda/funcs/binary_functor.h b/extensions/csrc/funcs/binary_functor.h similarity index 92% rename from extensions/csrc/cuda/funcs/binary_functor.h rename to extensions/csrc/funcs/binary_functor.h index e5a68d938..c5fe48076 100644 --- a/extensions/csrc/cuda/funcs/binary_functor.h +++ b/extensions/csrc/funcs/binary_functor.h @@ -1,27 +1,21 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include #include #include +#endif #include -#include "../utils/micros.h" -#include "../utils/vec_type_traits.h" #include "cast_functor.h" +#include "common/data_type.h" +#include "common/micros.h" namespace colossalAI { -namespace cuda { namespace funcs { -using utils::bfloat164; -using utils::bfloat168; -using utils::float4_; -using utils::float8_; -using utils::half4; -using utils::half8; - enum class BinaryOpType { kAdd = 0, kMinus, kMul, kDiv, kMax, kMin }; // Note(LiuYang): This file provides base math operation for data type @@ -61,6 +55,7 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(T, T, T, BinaryOpType::kMin, HOSTDEVICE, STMTS_WRAPPER({ return min(lhs, rhs); }), typename T) +#if defined(COLOSSAL_WITH_CUDA) COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(half, half, half, BinaryOpType::kAdd, DEVICE, STMTS_WRAPPER({ return __hadd(lhs, rhs); @@ -151,8 +146,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( })) COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( - bfloat164, bfloat164, float4_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({ - float4_ fc; + dtype::bfloat164, dtype::bfloat164, dtype::float4_, BinaryOpType::kMul, + DEVICE, STMTS_WRAPPER({ + dtype::float4_ fc; BinaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, BinaryOpType::kMul> mul; @@ -162,8 +158,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( })) COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( - bfloat168, bfloat168, float8_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({ - float8_ fc; + dtype::bfloat168, dtype::bfloat168, dtype::float8_, BinaryOpType::kMul, + DEVICE, STMTS_WRAPPER({ + dtype::float8_ fc; 
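
With the packed vector structs moved into colossalAI::dtype and the traits into colossalAI::common, kernels now spell them as dtype::half4, common::VecTypeTrait<T, N>, and so on. A simplified host-only sketch of what such a trait does, using stand-in pack types instead of the CUDA built-ins:

```cpp
#include <cstddef>

// Host-only stand-in for colossalAI::common::VecTypeTrait: map a scalar
// type plus a lane count to the packed type used for vectorized
// loads/stores. The real traits map to CUDA built-ins such as float4 and
// to the packs in colossalAI::dtype (half4, bfloat168, ...).
namespace common {
struct float2_ { float x, y; };    // stand-in for float2
struct float4_ { float2_ x, y; };  // mirrors dtype::float4_
template <typename T, int VecSize> struct VecTypeTrait;  // primary template
template <typename T> struct VecTypeTrait<T, 1> { using Type = T; };
template <> struct VecTypeTrait<float, 2> { using Type = float2_; };
template <> struct VecTypeTrait<float, 4> { using Type = float4_; };
}  // namespace common

int main() {
  using V = common::VecTypeTrait<float, 4>::Type;  // -> float4_
  static_assert(sizeof(V) == 4 * sizeof(float), "expected a 4-lane pack");
  V v{};  // one 16-byte load/store moves 4 floats at once
  (void)v;
  return 0;
}
```
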
BinaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, BinaryOpType::kMul> mul; @@ -184,8 +181,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( })) COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( - half4, half4, float4_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({ - float4_ fc; + dtype::half4, dtype::half4, dtype::float4_, BinaryOpType::kMul, DEVICE, + STMTS_WRAPPER({ + dtype::float4_ fc; BinaryOpFunctor mul; fc.x = mul(lhs.x, rhs.x); fc.y = mul(lhs.y, rhs.y); @@ -193,8 +191,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( })) COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( - half8, half8, float8_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({ - float8_ fc; + dtype::half8, dtype::half8, dtype::float8_, BinaryOpType::kMul, DEVICE, + STMTS_WRAPPER({ + dtype::float8_ fc; BinaryOpFunctor mul; fc.x = mul(lhs.x, rhs.x); fc.y = mul(lhs.y, rhs.y); @@ -203,10 +202,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION( return fc; })) -#undef COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION +#endif /* defined(COLOSSAL_WITH_CUDA) */ +#undef COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION #undef STMTS_WRAPPER - } // namespace funcs -} // namespace cuda } // namespace colossalAI diff --git a/extensions/csrc/cuda/funcs/cast_functor.h b/extensions/csrc/funcs/cast_functor.h similarity index 87% rename from extensions/csrc/cuda/funcs/cast_functor.h rename to extensions/csrc/funcs/cast_functor.h index d78ca4af2..7fc22fb44 100644 --- a/extensions/csrc/cuda/funcs/cast_functor.h +++ b/extensions/csrc/funcs/cast_functor.h @@ -1,29 +1,23 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include #include #include +#endif #include -#include "../utils/micros.h" -#include "../utils/vec_type_traits.h" +#include "common/data_type.h" +#include "common/micros.h" // Note(LiuYang): This file provides base math operation for data type // include POD and cuda built-in type such as half and __nv_bfloat16 namespace colossalAI { -namespace cuda { namespace funcs { -using utils::bfloat164; -using utils::bfloat168; -using utils::float4_; -using utils::float8_; -using utils::half4; -using utils::half8; - template struct CastFunctor : public std::unary_function { HOSTDEVICE To operator()(From val) { return static_cast(val); } @@ -36,6 +30,7 @@ struct CastFunctor : public std::unary_function { FUNCTION_MODIFIER TO operator()(FROM val) STMTS \ }; +#if defined(COLOSSAL_WITH_CUDA) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( int2, float2, { return make_float2(val.x, val.y); }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( @@ -54,27 +49,27 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( half, float, { return __half2float(val); }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float4, half4, + float4, dtype::half4, { - half4 dst; + dtype::half4 dst; dst.x = __floats2half2_rn(val.x, val.y); dst.y = __floats2half2_rn(val.z, val.w); return dst; }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float4_, half4, + dtype::float4_, dtype::half4, { - half4 dst; + dtype::half4 dst; dst.x = __float22half2_rn(val.x); dst.y = __float22half2_rn(val.y); return dst; }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float8_, half8, + dtype::float8_, dtype::half8, { - half8 dst; + dtype::half8 dst; dst.x = __float22half2_rn(val.x); dst.y = __float22half2_rn(val.y); dst.z = __float22half2_rn(val.z); @@ -88,9 +83,9 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( float, __nv_bfloat16, { return __float2bfloat16_rn(val); }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float4, bfloat164, + float4, dtype::bfloat164, { - bfloat164 dst; + 
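
cast_functor.h keeps CastFunctor<From, To> as the single conversion point between scalar and packed types; only the spelling of the packed types changes to dtype::. A host-only sketch of the functor-specialization pattern, with plain stand-in types in place of the CUDA intrinsics:

```cpp
#include <cstdio>

// Host-only sketch of the colossalAI::funcs::CastFunctor pattern: the
// primary template falls back to static_cast and specializations handle
// packed conversions.
template <typename From, typename To>
struct CastFunctor {
  To operator()(From val) const { return static_cast<To>(val); }
};

struct float2_ { float x, y; };  // stand-in for float2
struct int2_ { int x, y; };      // stand-in for a packed target type

template <>
struct CastFunctor<float2_, int2_> {
  // The real specializations call intrinsics such as __float22half2_rn
  // or __float22bfloat162_rn here.
  int2_ operator()(float2_ v) const {
    return int2_{static_cast<int>(v.x), static_cast<int>(v.y)};
  }
};

int main() {
  CastFunctor<double, int> scalar_cast;
  CastFunctor<float2_, int2_> packed_cast;
  int2_ p = packed_cast(float2_{1.9f, 2.1f});
  std::printf("%d %d %d\n", scalar_cast(3.7), p.x, p.y);  // 3 1 2
  return 0;
}
```
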
dtype::bfloat164 dst; dst.x = __floats2bfloat162_rn(val.x, val.y); dst.y = __floats2bfloat162_rn(val.z, val.w); return dst; @@ -105,18 +100,18 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( float2, __nv_bfloat162, { return __float22bfloat162_rn(val); }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float4_, bfloat164, + dtype::float4_, dtype::bfloat164, { - bfloat164 dst; + dtype::bfloat164 dst; dst.x = __float22bfloat162_rn(val.x); dst.y = __float22bfloat162_rn(val.y); return dst; }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float8_, bfloat168, + dtype::float8_, dtype::bfloat168, { - bfloat168 dst; + dtype::bfloat168 dst; dst.x = __float22bfloat162_rn(val.x); dst.y = __float22bfloat162_rn(val.y); dst.z = __float22bfloat162_rn(val.z); @@ -141,18 +136,18 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( float2, __nv_bfloat162, { return __floats2bfloat162_rn(val.x, val.y); }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float4_, bfloat164, + dtype::float4_, dtype::bfloat164, { - bfloat164 dst; + dtype::bfloat164 dst; dst.x = __floats2bfloat162_rn(val.x.x, val.x.y); dst.y = __floats2bfloat162_rn(val.y.x, val.y.y); return dst; }, DEVICE) COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( - float8_, bfloat168, + dtype::float8_, dtype::bfloat168, { - bfloat168 dst; + dtype::bfloat168 dst; dst.x = __floats2bfloat162_rn(val.x.x, val.x.y); dst.y = __floats2bfloat162_rn(val.y.x, val.y.y); dst.z = __floats2bfloat162_rn(val.z.x, val.z.y); @@ -161,8 +156,8 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION( }, DEVICE) #endif /* defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 */ +#endif /* defined(COLOSSAL_WITH_CUDA) */ #undef COLOSSAL_CAST_FUNCTOR_SPECIALIZATION } // namespace funcs -} // namespace cuda } // namespace colossalAI diff --git a/extensions/csrc/cuda/funcs/reduce_function.h b/extensions/csrc/funcs/reduce_function.h similarity index 97% rename from extensions/csrc/cuda/funcs/reduce_function.h rename to extensions/csrc/funcs/reduce_function.h index da2743e62..58ff1e5bc 100644 --- a/extensions/csrc/cuda/funcs/reduce_function.h +++ b/extensions/csrc/funcs/reduce_function.h @@ -1,13 +1,13 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include #include -#include "../funcs/binary_functor.h" +#include "binary_functor.h" namespace colossalAI { -namespace cuda { namespace funcs { const float kReduceFloatInfNeg = -100000000.f; @@ -89,5 +89,6 @@ __forceinline__ __device__ void block_reduce(T* pval) { #undef COLOSSAL_BLOCK_REDUCE_IMPL } // namespace funcs -} // namespace cuda } // namespace colossalAI + +#endif /* defined(COLOSSAL_WITH_CUDA) */ diff --git a/extensions/csrc/cuda/funcs/ternary_functor.h b/extensions/csrc/funcs/ternary_functor.h similarity index 86% rename from extensions/csrc/cuda/funcs/ternary_functor.h rename to extensions/csrc/funcs/ternary_functor.h index 34b01cdf5..c7d8039de 100644 --- a/extensions/csrc/cuda/funcs/ternary_functor.h +++ b/extensions/csrc/funcs/ternary_functor.h @@ -1,18 +1,20 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include #include #include +#endif + #include #include -#include "../funcs/cast_functor.h" -#include "../utils/micros.h" +#include "cast_functor.h" +#include "common/micros.h" namespace colossalAI { -namespace cuda { namespace funcs { enum class TernaryOpType { kFma = 0 }; @@ -29,6 +31,7 @@ struct TernaryOpFunctor; FUNCTION_MODIFIER RET operator()(LT a, RT b, RET c) STMTS \ }; +#if defined(COLOSSAL_WITH_CUDA) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(float, float, float, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ @@ 
-91,16 +94,18 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fma(cast(a), b, c); })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - half4, half4, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float4_ fd; + dtype::half4, dtype::half4, dtype::float4_, TernaryOpType::kFma, DEVICE, + STMTS_WRAPPER({ + dtype::float4_ fd; TernaryOpFunctor fma; fd.x = fma(a.x, b.x, c.x); fd.y = fma(a.y, b.y, c.y); return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - half, half4, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float4_ fd; + half, dtype::half4, dtype::float4_, TernaryOpType::kFma, DEVICE, + STMTS_WRAPPER({ + dtype::float4_ fd; CastFunctor cast; TernaryOpFunctor fma; half2 s = cast(a); @@ -109,8 +114,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - half8, half8, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float8_ fd; + dtype::half8, dtype::half8, dtype::float8_, TernaryOpType::kFma, DEVICE, + STMTS_WRAPPER({ + dtype::float8_ fd; TernaryOpFunctor fma; fd.x = fma(a.x, b.x, c.x); fd.y = fma(a.y, b.y, c.y); @@ -119,8 +125,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - half, half8, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float8_ fd; + half, dtype::half8, dtype::float8_, TernaryOpType::kFma, DEVICE, + STMTS_WRAPPER({ + dtype::float8_ fd; CastFunctor cast; TernaryOpFunctor fma; half2 s = cast(a); @@ -153,8 +160,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fma(cast(a), b, c); })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - bfloat164, bfloat164, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float4_ fd; + dtype::bfloat164, dtype::bfloat164, dtype::float4_, TernaryOpType::kFma, + DEVICE, STMTS_WRAPPER({ + dtype::float4_ fd; TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, TernaryOpType::kFma> fma; @@ -163,9 +171,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - __nv_bfloat16, bfloat164, float4_, TernaryOpType::kFma, DEVICE, - STMTS_WRAPPER({ - float4_ fd; + __nv_bfloat16, dtype::bfloat164, dtype::float4_, TernaryOpType::kFma, + DEVICE, STMTS_WRAPPER({ + dtype::float4_ fd; CastFunctor<__nv_bfloat16, __nv_bfloat162> cast; TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, TernaryOpType::kFma> @@ -176,8 +184,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - bfloat168, bfloat168, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({ - float8_ fd; + dtype::bfloat168, dtype::bfloat168, dtype::float8_, TernaryOpType::kFma, + DEVICE, STMTS_WRAPPER({ + dtype::float8_ fd; TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, TernaryOpType::kFma> fma; @@ -188,9 +197,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( - __nv_bfloat16, bfloat168, float8_, TernaryOpType::kFma, DEVICE, - STMTS_WRAPPER({ - float8_ fd; + __nv_bfloat16, dtype::bfloat168, dtype::float8_, TernaryOpType::kFma, + DEVICE, STMTS_WRAPPER({ + dtype::float8_ fd; CastFunctor<__nv_bfloat16, __nv_bfloat162> cast; TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2, TernaryOpType::kFma> @@ -203,10 +212,10 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION( return fd; })) -#undef COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION +#endif /* defined(COLOSSAL_WITH_CUDA) */ +#undef COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION #undef STMTS_WRAPPER } // namespace funcs -} // namespace cuda } // namespace colossalAI diff --git 
a/extensions/csrc/cuda/funcs/unary_functor.h b/extensions/csrc/funcs/unary_functor.h similarity index 85% rename from extensions/csrc/cuda/funcs/unary_functor.h rename to extensions/csrc/funcs/unary_functor.h index b8cd3c1a1..e1d23792a 100644 --- a/extensions/csrc/cuda/funcs/unary_functor.h +++ b/extensions/csrc/funcs/unary_functor.h @@ -1,16 +1,18 @@ #pragma once +#if defined(COLOSSAL_WITH_CUDA) #include #include #include #include +#endif #include -#include "../utils/micros.h" +#include "common/data_type.h" +#include "common/micros.h" namespace colossalAI { -namespace cuda { namespace funcs { template @@ -57,27 +59,30 @@ COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(int, int, UnaryOpType::kLog2Ceil, return log2_value; }) +#if defined(COLOSSAL_WITH_CUDA) + COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float2, float, UnaryOpType::kSum, DEVICE, { return val.x + val.y; }) COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float4, float, UnaryOpType::kSum, DEVICE, { return val.x + val.y + val.z + val.w; }) -COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float4_, float, UnaryOpType::kSum, DEVICE, - { +COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(dtype::float4_, float, UnaryOpType::kSum, + DEVICE, { return val.x.x + val.x.y + val.y.x + val.y.y; }) -COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float8_, float, UnaryOpType::kSum, DEVICE, - { +COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(dtype::float8_, float, UnaryOpType::kSum, + DEVICE, { return val.x.x + val.x.y + val.y.x + val.y.y + val.z.x + val.z.y + val.w.x + val.w.y; }) +#endif /* defined(COLOSSAL_WITH_CUDA) */ + #undef COLOSSAL_UARY_FUNCTOR_SPECIALIZATION } // namespace funcs -} // namespace cuda } // namespace colossalAI diff --git a/extensions/csrc/arm/cpu_adam_arm.cpp b/extensions/csrc/kernel/arm/cpu_adam_arm.cpp similarity index 100% rename from extensions/csrc/arm/cpu_adam_arm.cpp rename to extensions/csrc/kernel/arm/cpu_adam_arm.cpp diff --git a/extensions/csrc/arm/cpu_adam_arm.h b/extensions/csrc/kernel/arm/cpu_adam_arm.h similarity index 100% rename from extensions/csrc/arm/cpu_adam_arm.h rename to extensions/csrc/kernel/arm/cpu_adam_arm.h diff --git a/extensions/csrc/cuda/activation_kernel.cu b/extensions/csrc/kernel/cuda/activation_kernel.cu similarity index 92% rename from extensions/csrc/cuda/activation_kernel.cu rename to extensions/csrc/kernel/cuda/activation_kernel.cu index 372b30387..c69003d84 100644 --- a/extensions/csrc/cuda/activation_kernel.cu +++ b/extensions/csrc/kernel/cuda/activation_kernel.cu @@ -2,13 +2,15 @@ #include #include -#include "../common/micros.h" -#include "../common/mp_type_traits.h" +#include "common/micros.h" +#include "common/mp_type_traits.h" + +using colossalAI::common::MPTypeTrait; template __device__ __forceinline__ T silu_kernel(const T& x) { // x * sigmoid(x) - using MT = typename colossalAI::common::MPTypeTrait::Type; + using MT = typename MPTypeTrait::Type; return static_cast((static_cast(x)) / (static_cast(1.0f) + expf(static_cast(-x)))); } @@ -17,7 +19,7 @@ __global__ void act_and_mul_kernel( const scalar_t* __restrict__ ins_data, scalar_t* __restrict__ outs_data, const int64_t numel) { - using MT = typename colossalAI::common::MPTypeTrait::Type; + using MT = typename MPTypeTrait::Type; int64_t idx = static_cast(threadIdx.x) + static_cast(blockIdx.x) * static_cast(blockDim.x); const int64_t grid_size = blockDim.x * gridDim.x; diff --git a/extensions/csrc/cuda/attention/attention_utils.h b/extensions/csrc/kernel/cuda/attention/attention_utils.h similarity index 88% rename from extensions/csrc/cuda/attention/attention_utils.h rename to 
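
activation_kernel.cu now imports MPTypeTrait from colossalAI::common and aliases it once at file scope; the underlying pattern is unchanged: widen each element to the math type (float for half/bfloat16 inputs), compute, then narrow back. A host-only sketch of that idea with a simplified stand-in trait:

```cpp
#include <cmath>
#include <cstdio>

// Host-only sketch of the mixed-precision pattern in activation_kernel.cu.
// This stand-in trait always picks float; the real MPTypeTrait lives in
// common/mp_type_traits.h and selects float for half/bfloat16 inputs.
template <typename T>
struct MPTypeTrait {
  using Type = float;
};

template <typename T>
T silu(const T& x) {  // mirrors silu_kernel: x * sigmoid(x)
  using MT = typename MPTypeTrait<T>::Type;
  return static_cast<T>(static_cast<MT>(x) /
                        (static_cast<MT>(1.0f) + std::exp(static_cast<MT>(-x))));
}

int main() {
  std::printf("silu(1.0) = %f\n", silu(1.0f));  // ~0.731059
  return 0;
}
```
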
extensions/csrc/kernel/cuda/attention/attention_utils.h index c55033636..fa555fdc8 100644 --- a/extensions/csrc/cuda/attention/attention_utils.h +++ b/extensions/csrc/kernel/cuda/attention/attention_utils.h @@ -23,24 +23,16 @@ #include #include -#include "../funcs/binary_functor.h" -#include "../funcs/cast_functor.h" -#include "../funcs/ternary_functor.h" -#include "../funcs/unary_functor.h" -#include "../utils/vec_type_traits.h" +#include "common/vec_type_traits.h" +#include "funcs/binary_functor.h" +#include "funcs/cast_functor.h" +#include "funcs/ternary_functor.h" +#include "funcs/unary_functor.h" namespace colossalAI { namespace cuda { namespace attention { -using colossalAI::cuda::funcs::BinaryOpFunctor; -using colossalAI::cuda::funcs::BinaryOpType; -using colossalAI::cuda::funcs::TernaryOpFunctor; -using colossalAI::cuda::funcs::TernaryOpType; -using colossalAI::cuda::funcs::UnaryOpFunctor; -using colossalAI::cuda::funcs::UnaryOpType; -using colossalAI::cuda::utils::FloatVecTypeTrait; - #define WARP_SIZE 32 #define VEC_SIZE_8 8 @@ -51,11 +43,11 @@ using colossalAI::cuda::utils::FloatVecTypeTrait; // Q*K^T operation. template inline __device__ float qk_dot_(const VecT (&q)[N], const VecT (&k)[N]) { - using A_vec = typename FloatVecTypeTrait::Type; + using A_vec = typename common::FloatVecTypeTrait::Type; // Compute the parallel products for Q*K^T (treat vector lanes separately). - BinaryOpFunctor mul_vect; - UnaryOpFunctor sum_vect; - TernaryOpFunctor fma; + funcs::BinaryOpFunctor mul_vect; + funcs::UnaryOpFunctor sum_vect; + funcs::TernaryOpFunctor fma; A_vec qk_vec = mul_vect(q[0], k[0]); #pragma unroll diff --git a/extensions/csrc/cuda/context_kv_cache_memcpy_kernel.cu b/extensions/csrc/kernel/cuda/context_kv_cache_memcpy_kernel.cu similarity index 99% rename from extensions/csrc/cuda/context_kv_cache_memcpy_kernel.cu rename to extensions/csrc/kernel/cuda/context_kv_cache_memcpy_kernel.cu index f992e6faa..6e05434b8 100644 --- a/extensions/csrc/cuda/context_kv_cache_memcpy_kernel.cu +++ b/extensions/csrc/kernel/cuda/context_kv_cache_memcpy_kernel.cu @@ -2,7 +2,7 @@ #include #include "utils/vec_copy.h" -#include "../common/micros.h" +#include "common/micros.h" using colossalAI::cuda::utils::copy_vector; using colossalAI::cuda::utils::get_vec_size; diff --git a/extensions/csrc/cuda/decode_kv_cache_memcpy_kernel.cu b/extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu similarity index 99% rename from extensions/csrc/cuda/decode_kv_cache_memcpy_kernel.cu rename to extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu index 8eb9fb00f..f29379f5c 100644 --- a/extensions/csrc/cuda/decode_kv_cache_memcpy_kernel.cu +++ b/extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu @@ -2,7 +2,7 @@ #include #include "utils/vec_copy.h" -#include "../common/micros.h" +#include "common/micros.h" using colossalAI::cuda::utils::copy_vector; using colossalAI::cuda::utils::get_vec_size; diff --git a/extensions/csrc/cuda/flash_decoding_attention_kernel.cu b/extensions/csrc/kernel/cuda/flash_decoding_attention_kernel.cu similarity index 97% rename from extensions/csrc/cuda/flash_decoding_attention_kernel.cu rename to extensions/csrc/kernel/cuda/flash_decoding_attention_kernel.cu index 69b50616b..8930ba04c 100644 --- a/extensions/csrc/cuda/flash_decoding_attention_kernel.cu +++ b/extensions/csrc/kernel/cuda/flash_decoding_attention_kernel.cu @@ -7,11 +7,11 @@ #include #include -#include "../common/micros.h" +#include "common/micros.h" #include "funcs/cast_functor.h" #include 
"funcs/ternary_functor.h" #include "funcs/binary_functor.h" -#include "utils/vec_type_traits.h" +#include "common/vec_type_traits.h" #include "attention/attention_utils.h" #define WARP_SIZE 32 @@ -34,13 +34,13 @@ constexpr unsigned int nextHighestPowerOf2(unsigned int v) { return v; } -using colossalAI::cuda::funcs::BinaryOpType; -using colossalAI::cuda::funcs::CastFunctor; -using colossalAI::cuda::funcs::TernaryOpFunctor; -using colossalAI::cuda::funcs::TernaryOpType; -using colossalAI::cuda::funcs::zero; -using colossalAI::cuda::utils::VecTypeTrait; -using colossalAI::cuda::utils::FloatVecTypeTrait; +using colossalAI::funcs::BinaryOpType; +using colossalAI::funcs::CastFunctor; +using colossalAI::funcs::TernaryOpFunctor; +using colossalAI::funcs::TernaryOpType; +using colossalAI::funcs::zero; +using colossalAI::common::VecTypeTrait; +using colossalAI::common::FloatVecTypeTrait; using namespace colossalAI::cuda::attention; diff --git a/extensions/csrc/cuda/fused_rotary_emb_and_cache_kernel.cu b/extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu similarity index 99% rename from extensions/csrc/cuda/fused_rotary_emb_and_cache_kernel.cu rename to extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu index 29715ca22..52f3588a7 100644 --- a/extensions/csrc/cuda/fused_rotary_emb_and_cache_kernel.cu +++ b/extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu @@ -3,8 +3,8 @@ #include #include "utils/vec_copy.h" -#include "../common/micros.h" -#include "../common/mp_type_traits.h" +#include "common/micros.h" +#include "common/mp_type_traits.h" using colossalAI::cuda::utils::copy_vector; using colossalAI::cuda::utils::get_vec_size; diff --git a/extensions/csrc/cuda/get_cos_and_sin_kernel.cu b/extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu similarity index 99% rename from extensions/csrc/cuda/get_cos_and_sin_kernel.cu rename to extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu index 40db089b2..9c78666e6 100644 --- a/extensions/csrc/cuda/get_cos_and_sin_kernel.cu +++ b/extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu @@ -2,7 +2,7 @@ #include #include "utils/vec_copy.h" -#include "../common/micros.h" +#include "common/micros.h" using colossalAI::cuda::utils::copy_vector; using colossalAI::cuda::utils::get_vec_size; diff --git a/extensions/csrc/cuda/layer_norm_kernel.cu b/extensions/csrc/kernel/cuda/layer_norm_kernel.cu similarity index 99% rename from extensions/csrc/cuda/layer_norm_kernel.cu rename to extensions/csrc/kernel/cuda/layer_norm_kernel.cu index 8239adc9f..cd569f741 100644 --- a/extensions/csrc/cuda/layer_norm_kernel.cu +++ b/extensions/csrc/kernel/cuda/layer_norm_kernel.cu @@ -9,7 +9,7 @@ #include "ATen/AccumulateType.h" #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/DeviceUtils.cuh" -#include "../common/micros.h" +#include "common/micros.h" template __device__ void cuWelfordOnlineSum(const U curr, U& mu, U& sigma2, U& count) { diff --git a/extensions/csrc/cuda/moe_kernel.cu b/extensions/csrc/kernel/cuda/moe_kernel.cu similarity index 98% rename from extensions/csrc/cuda/moe_kernel.cu rename to extensions/csrc/kernel/cuda/moe_kernel.cu index a60932c76..ff7480086 100644 --- a/extensions/csrc/cuda/moe_kernel.cu +++ b/extensions/csrc/kernel/cuda/moe_kernel.cu @@ -6,9 +6,8 @@ #include "funcs/reduce_function.h" - -using colossalAI::cuda::funcs::block_reduce; -using colossalAI::cuda::funcs::ReduceType; +using colossalAI::funcs::block_reduce; +using colossalAI::funcs::ReduceType; template __device__ void moe_dpch_one_fwd(T *src_row, T 
*dst_row, const int cols) { @@ -540,7 +539,7 @@ void cumsum_launch(int *inputs, int *outputs, const int s, const int e) { // API FUNCTIONS -------------------------------- -#define DISPATCH_FLOAT_AND_HALF(TYPE, NAME, ...) \ +#define DISPATCH_FLOAT_AND_HALF_MOE(TYPE, NAME, ...) \ switch (TYPE) { \ case at::ScalarType::Float: { \ using scalar_t = float; \ @@ -566,7 +565,7 @@ torch::Tensor moe_dispatch_cuda_forward(int s, int ec, int h, torch::dtype(batch_tokens.dtype()).device(batch_tokens.device())); auto k = mask.size(0); - DISPATCH_FLOAT_AND_HALF( + DISPATCH_FLOAT_AND_HALF_MOE( batch_tokens.scalar_type(), "moe dispatch forward", moe_dpch_fwd_launch( batch_tokens.data_ptr(), res.data_ptr(), @@ -586,7 +585,7 @@ torch::Tensor moe_dispatch_cuda_backward(int s, int ec, int h, {s, h}, torch::dtype(expert_grad.dtype()).device(expert_grad.device())); auto k = mask.size(0); - DISPATCH_FLOAT_AND_HALF( + DISPATCH_FLOAT_AND_HALF_MOE( expert_grad.scalar_type(), "moe dispatch backward", moe_dpch_bwd_launch( res.data_ptr(), expert_grad.data_ptr(), @@ -609,7 +608,7 @@ torch::Tensor moe_combine_cuda_forward(int s, int e, int c, int h, torch::dtype(expert_tokens.dtype()).device(expert_tokens.device())); auto k = mask.size(0); - DISPATCH_FLOAT_AND_HALF( + DISPATCH_FLOAT_AND_HALF_MOE( expert_tokens.scalar_type(), "moe combine forward", moe_cb_fwd_launch( expert_tokens.data_ptr(), res.data_ptr(), @@ -636,7 +635,7 @@ std::vector moe_combine_cuda_backward( {s, e}, torch::dtype(logits.dtype()).device(logits.device())); auto k = mask.size(0); - DISPATCH_FLOAT_AND_HALF( + DISPATCH_FLOAT_AND_HALF_MOE( tokens_grad.scalar_type(), "moe combine backward", moe_cb_bwd_launch( tokens_grad.data_ptr(), egrad.data_ptr(), diff --git a/extensions/csrc/cuda/multi_tensor_adam_kernel.cu b/extensions/csrc/kernel/cuda/multi_tensor_adam_kernel.cu similarity index 99% rename from extensions/csrc/cuda/multi_tensor_adam_kernel.cu rename to extensions/csrc/kernel/cuda/multi_tensor_adam_kernel.cu index b7793b364..e0c2f0b4c 100644 --- a/extensions/csrc/cuda/multi_tensor_adam_kernel.cu +++ b/extensions/csrc/kernel/cuda/multi_tensor_adam_kernel.cu @@ -15,7 +15,7 @@ #include #include "multi_tensor_apply.cuh" -#include "../common/micros.h" +#include "common/micros.h" #define BLOCK_SIZE 512 #define ILP 4 diff --git a/extensions/csrc/cuda/multi_tensor_apply.cuh b/extensions/csrc/kernel/cuda/multi_tensor_apply.cuh similarity index 99% rename from extensions/csrc/cuda/multi_tensor_apply.cuh rename to extensions/csrc/kernel/cuda/multi_tensor_apply.cuh index 799ccfa73..8c98687ce 100644 --- a/extensions/csrc/cuda/multi_tensor_apply.cuh +++ b/extensions/csrc/kernel/cuda/multi_tensor_apply.cuh @@ -12,7 +12,7 @@ #include #include -#include "../common/micros.h" +#include "common/micros.h" // #include diff --git a/extensions/csrc/cuda/multi_tensor_l2norm_kernel.cu b/extensions/csrc/kernel/cuda/multi_tensor_l2norm_kernel.cu similarity index 99% rename from extensions/csrc/cuda/multi_tensor_l2norm_kernel.cu rename to extensions/csrc/kernel/cuda/multi_tensor_l2norm_kernel.cu index d2e0f8734..3596aa3d5 100644 --- a/extensions/csrc/cuda/multi_tensor_l2norm_kernel.cu +++ b/extensions/csrc/kernel/cuda/multi_tensor_l2norm_kernel.cu @@ -11,8 +11,7 @@ #include #include "multi_tensor_apply.cuh" -#include "../common/micros.h" -#include "funcs/reduce_function.h" +#include "common/micros.h" #define BLOCK_SIZE 512 #define ILP 4 diff --git a/extensions/csrc/cuda/multi_tensor_lamb_kernel.cu b/extensions/csrc/kernel/cuda/multi_tensor_lamb_kernel.cu similarity index 99% 
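
Renaming the local dispatch macro to DISPATCH_FLOAT_AND_HALF_MOE removes the clash with the identically named macro in common/micros.h once both become visible in one translation unit. A minimal sketch of the switch-based type dispatch these macros implement, with an illustrative ScalarType enum standing in for at::ScalarType:

```cpp
#include <cstdio>

// Illustrative scalar-type tag standing in for at::ScalarType.
enum class ScalarType { Float, Half };

// Minimal sketch of the dispatch the DISPATCH_* macros expand to: pick a
// concrete scalar_t per dtype and run the same body for each supported type.
#define DISPATCH_FLOAT_AND_HALF_MOE(TYPE, NAME, ...) \
  switch (TYPE) {                                    \
    case ScalarType::Float: {                        \
      using scalar_t = float;                        \
      __VA_ARGS__;                                   \
      break;                                         \
    }                                                \
    case ScalarType::Half: {                         \
      using scalar_t = short; /* host stand-in */    \
      __VA_ARGS__;                                   \
      break;                                         \
    }                                                \
  }

int main() {
  ScalarType t = ScalarType::Float;
  DISPATCH_FLOAT_AND_HALF_MOE(
      t, "demo", std::printf("element size = %zu bytes\n", sizeof(scalar_t)));
  return 0;
}
```
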
rename from extensions/csrc/cuda/multi_tensor_lamb_kernel.cu rename to extensions/csrc/kernel/cuda/multi_tensor_lamb_kernel.cu index 82c02f36d..05b3d1199 100644 --- a/extensions/csrc/cuda/multi_tensor_lamb_kernel.cu +++ b/extensions/csrc/kernel/cuda/multi_tensor_lamb_kernel.cu @@ -10,7 +10,7 @@ #include #include "multi_tensor_apply.cuh" -#include "../common/micros.h" +#include "common/micros.h" #define BLOCK_SIZE 512 #define ILP 4 diff --git a/extensions/csrc/cuda/multi_tensor_scale_kernel.cu b/extensions/csrc/kernel/cuda/multi_tensor_scale_kernel.cu similarity index 99% rename from extensions/csrc/cuda/multi_tensor_scale_kernel.cu rename to extensions/csrc/kernel/cuda/multi_tensor_scale_kernel.cu index 0dec1d5d1..a84c93c3b 100644 --- a/extensions/csrc/cuda/multi_tensor_scale_kernel.cu +++ b/extensions/csrc/kernel/cuda/multi_tensor_scale_kernel.cu @@ -10,7 +10,7 @@ #include #include "multi_tensor_apply.cuh" -#include "../common/micros.h" +#include "common/micros.h" #define BLOCK_SIZE 512 #define ILP 4 diff --git a/extensions/csrc/cuda/multi_tensor_sgd_kernel.cu b/extensions/csrc/kernel/cuda/multi_tensor_sgd_kernel.cu similarity index 99% rename from extensions/csrc/cuda/multi_tensor_sgd_kernel.cu rename to extensions/csrc/kernel/cuda/multi_tensor_sgd_kernel.cu index d0cf786f8..d48bb7053 100644 --- a/extensions/csrc/cuda/multi_tensor_sgd_kernel.cu +++ b/extensions/csrc/kernel/cuda/multi_tensor_sgd_kernel.cu @@ -7,7 +7,7 @@ #include #include -#include "../common/micros.h" +#include "common/micros.h" #include "multi_tensor_apply.cuh" #define BLOCK_SIZE 512 diff --git a/extensions/csrc/cuda/rms_layernorm_kernel.cu b/extensions/csrc/kernel/cuda/rms_layernorm_kernel.cu similarity index 97% rename from extensions/csrc/cuda/rms_layernorm_kernel.cu rename to extensions/csrc/kernel/cuda/rms_layernorm_kernel.cu index f109edca4..0cd330b5f 100644 --- a/extensions/csrc/cuda/rms_layernorm_kernel.cu +++ b/extensions/csrc/kernel/cuda/rms_layernorm_kernel.cu @@ -7,18 +7,18 @@ #include -#include "../common/micros.h" +#include "common/micros.h" #include "funcs/cast_functor.h" #include "funcs/binary_functor.h" #include "funcs/reduce_function.h" -#include "utils/vec_type_traits.h" - -using colossalAI::cuda::funcs::block_reduce; -using colossalAI::cuda::funcs::ReduceType; -using colossalAI::cuda::funcs::CastFunctor; -using colossalAI::cuda::funcs::BinaryOpFunctor; -using colossalAI::cuda::funcs::BinaryOpType; -using colossalAI::cuda::utils::VecTypeTrait; +#include "common/vec_type_traits.h" + +using colossalAI::funcs::block_reduce; +using colossalAI::funcs::ReduceType; +using colossalAI::funcs::CastFunctor; +using colossalAI::funcs::BinaryOpFunctor; +using colossalAI::funcs::BinaryOpType; +using colossalAI::common::VecTypeTrait; #define RMSNORM_LAUNCHER(UNROLL_FACTOR, THREADDIM) \ DISPATCH_RMSNORM_FLOAT_HALF_AND_BFLOAT( \ diff --git a/extensions/csrc/cuda/scaled_masked_softmax_kernel.cu b/extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu similarity index 99% rename from extensions/csrc/cuda/scaled_masked_softmax_kernel.cu rename to extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu index 3e51c4b66..db9a2bbd6 100644 --- a/extensions/csrc/cuda/scaled_masked_softmax_kernel.cu +++ b/extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu @@ -14,15 +14,15 @@ #include #include -#include "../common/micros.h" +#include "common/micros.h" #include "utils/vec_copy.h" #include "funcs/reduce_function.h" #include "funcs/unary_functor.h" -using colossalAI::cuda::funcs::UnaryOpFunctor; -using 
colossalAI::cuda::funcs::UnaryOpType; -using colossalAI::cuda::funcs::warp_reduce; -using colossalAI::cuda::funcs::ReduceType; +using colossalAI::funcs::UnaryOpFunctor; +using colossalAI::funcs::UnaryOpType; +using colossalAI::funcs::warp_reduce; +using colossalAI::funcs::ReduceType; using colossalAI::cuda::utils::copy_vector; diff --git a/extensions/csrc/cuda/scaled_upper_triang_masked_softmax_kernel.cu b/extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu similarity index 99% rename from extensions/csrc/cuda/scaled_upper_triang_masked_softmax_kernel.cu rename to extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu index 510d98f28..db90916f3 100644 --- a/extensions/csrc/cuda/scaled_upper_triang_masked_softmax_kernel.cu +++ b/extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu @@ -14,15 +14,15 @@ #include #include -#include "../common/micros.h" +#include "common/micros.h" #include "utils/vec_copy.h" #include "funcs/reduce_function.h" #include "funcs/unary_functor.h" -using colossalAI::cuda::funcs::UnaryOpFunctor; -using colossalAI::cuda::funcs::UnaryOpType; -using colossalAI::cuda::funcs::warp_reduce; -using colossalAI::cuda::funcs::ReduceType; +using colossalAI::funcs::UnaryOpFunctor; +using colossalAI::funcs::UnaryOpType; +using colossalAI::funcs::warp_reduce; +using colossalAI::funcs::ReduceType; using colossalAI::cuda::utils::copy_vector; using colossalAI::cuda::utils::copy_zero_vector; diff --git a/extensions/csrc/cuda/utils/gpu_launch_config.h b/extensions/csrc/kernel/cuda/utils/gpu_launch_config.h similarity index 100% rename from extensions/csrc/cuda/utils/gpu_launch_config.h rename to extensions/csrc/kernel/cuda/utils/gpu_launch_config.h diff --git a/extensions/csrc/cuda/utils/micros.h b/extensions/csrc/kernel/cuda/utils/micros.h similarity index 100% rename from extensions/csrc/cuda/utils/micros.h rename to extensions/csrc/kernel/cuda/utils/micros.h diff --git a/extensions/csrc/cuda/utils/nvgpu_dev_info.h b/extensions/csrc/kernel/cuda/utils/nvgpu_dev_info.h similarity index 100% rename from extensions/csrc/cuda/utils/nvgpu_dev_info.h rename to extensions/csrc/kernel/cuda/utils/nvgpu_dev_info.h diff --git a/extensions/csrc/cuda/utils/vec_copy.h b/extensions/csrc/kernel/cuda/utils/vec_copy.h similarity index 82% rename from extensions/csrc/cuda/utils/vec_copy.h rename to extensions/csrc/kernel/cuda/utils/vec_copy.h index 39e28d268..8fe4e113c 100644 --- a/extensions/csrc/cuda/utils/vec_copy.h +++ b/extensions/csrc/kernel/cuda/utils/vec_copy.h @@ -4,8 +4,8 @@ #include #include -#include "../funcs/cast_functor.h" -#include "vec_type_traits.h" +#include "common/vec_type_traits.h" +#include "funcs/cast_functor.h" namespace colossalAI { namespace cuda { @@ -13,7 +13,7 @@ namespace utils { template __device__ __inline__ void copy_vector(T *dst, const T *src) { - using VT = typename colossalAI::cuda::utils::VecTypeTrait::Type; + using VT = typename common::VecTypeTrait::Type; // Note(LiuYang): Here static_cast can't be used for cast between two pointer *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); } @@ -29,9 +29,8 @@ __device__ __inline__ void copy_vector(float *dst, const float *src) { template __device__ __inline__ void copy_zero_vector(T *dst) { - using VT = typename colossalAI::cuda::utils::VecTypeTrait::Type; - *(reinterpret_cast(dst)) = - colossalAI::cuda::funcs::CastFunctor()(0.0f); + using VT = typename common::VecTypeTrait::Type; + *(reinterpret_cast(dst)) = funcs::CastFunctor()(0.0f); } template diff --git 
a/extensions/csrc/x86/cpu_adam.cpp b/extensions/csrc/kernel/x86/cpu_adam.cpp similarity index 100% rename from extensions/csrc/x86/cpu_adam.cpp rename to extensions/csrc/kernel/x86/cpu_adam.cpp diff --git a/extensions/csrc/x86/cpu_adam.h b/extensions/csrc/kernel/x86/cpu_adam.h similarity index 100% rename from extensions/csrc/x86/cpu_adam.h rename to extensions/csrc/kernel/x86/cpu_adam.h diff --git a/extensions/cuda_extension.py b/extensions/cuda_extension.py index f1e0095b2..b722057c9 100644 --- a/extensions/cuda_extension.py +++ b/extensions/cuda_extension.py @@ -21,6 +21,7 @@ class _CudaExtension(_CppExtension): """ This function should return a list of nvcc compilation flags for extensions. """ + return ["-DCOLOSSAL_WITH_CUDA"] def is_available(self) -> bool: # cuda extension can only be built if cuda is available @@ -53,6 +54,12 @@ class _CudaExtension(_CppExtension): cuda_include = os.path.join(CUDA_HOME, "include") return cuda_include + def include_dirs(self) -> List[str]: + """ + This function should return a list of include files for extensions. + """ + return super().include_dirs() + [self.get_cuda_home_include()] + def build_jit(self) -> None: from torch.utils.cpp_extension import CUDA_HOME, load diff --git a/extensions/inference/inference_ops_cuda.py b/extensions/inference/inference_ops_cuda.py deleted file mode 100644 index 1ad58f3ea..000000000 --- a/extensions/inference/inference_ops_cuda.py +++ /dev/null @@ -1,36 +0,0 @@ -from ..cuda_extension import _CudaExtension -from ..utils import get_cuda_cc_flag - - -class InferenceOpsCudaExtension(_CudaExtension): - def __init__(self): - super().__init__(name="inference_ops_cuda") - - def sources_files(self): - ret = [ - self.csrc_abs_path(fname) - for fname in [ - "cuda/pybind/inference.cpp", - "cuda/decode_kv_cache_memcpy_kernel.cu", - "cuda/context_kv_cache_memcpy_kernel.cu", - "cuda/fused_rotary_emb_and_cache_kernel.cu", - "cuda/activation_kernel.cu", - "cuda/rms_layernorm_kernel.cu", - "cuda/get_cos_and_sin_kernel.cu", - "cuda/flash_decoding_attention_kernel.cu", - ] - ] - return ret - - def include_dirs(self): - ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()] - return ret - - def cxx_flags(self): - version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"] - return ["-O3"] + version_dependent_macros - - def nvcc_flags(self): - extra_cuda_flags = ["-lineinfo"] - extra_cuda_flags.extend(get_cuda_cc_flag()) - return ["-O3", "--use_fast_math"] + extra_cuda_flags diff --git a/extensions/pybind/__init__.py b/extensions/pybind/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/extensions/cpu_adam/__init__.py b/extensions/pybind/cpu_adam/__init__.py similarity index 100% rename from extensions/cpu_adam/__init__.py rename to extensions/pybind/cpu_adam/__init__.py diff --git a/extensions/cpu_adam/cpu_adam_arm.py b/extensions/pybind/cpu_adam/cpu_adam_arm.py similarity index 80% rename from extensions/cpu_adam/cpu_adam_arm.py rename to extensions/pybind/cpu_adam/cpu_adam_arm.py index 61c4f3ed0..9595eda69 100644 --- a/extensions/cpu_adam/cpu_adam_arm.py +++ b/extensions/pybind/cpu_adam/cpu_adam_arm.py @@ -1,6 +1,7 @@ import platform +from typing import List -from ..cpp_extension import _CppExtension +from ...cpp_extension import _CppExtension class CpuAdamArmExtension(_CppExtension): @@ -20,12 +21,12 @@ class CpuAdamArmExtension(_CppExtension): # necessary 4 functions def sources_files(self): ret = [ - self.csrc_abs_path("arm/cpu_adam_arm.cpp"), + 
self.csrc_abs_path("kernel/arm/cpu_adam_arm.cpp"), ] return ret - def include_dirs(self): - return [] + def include_dirs(self) -> List[str]: + return super().include_dirs() def cxx_flags(self): extra_cxx_flags = [ diff --git a/extensions/cpu_adam/cpu_adam_x86.py b/extensions/pybind/cpu_adam/cpu_adam_x86.py similarity index 83% rename from extensions/cpu_adam/cpu_adam_x86.py rename to extensions/pybind/cpu_adam/cpu_adam_x86.py index 4789f2f32..525f3abe1 100644 --- a/extensions/cpu_adam/cpu_adam_x86.py +++ b/extensions/pybind/cpu_adam/cpu_adam_x86.py @@ -1,7 +1,7 @@ import platform -from ..cuda_extension import _CudaExtension -from ..utils import append_nvcc_threads +from ...cuda_extension import _CudaExtension +from ...utils import append_nvcc_threads class CpuAdamX86Extension(_CudaExtension): @@ -21,13 +21,10 @@ class CpuAdamX86Extension(_CudaExtension): # necessary 4 functions def sources_files(self): ret = [ - self.csrc_abs_path("x86/cpu_adam.cpp"), + self.csrc_abs_path("kernel/x86/cpu_adam.cpp"), ] return ret - def include_dirs(self): - return [self.csrc_abs_path("includes"), self.get_cuda_home_include()] - def cxx_flags(self): extra_cxx_flags = [ "-std=c++14", @@ -50,5 +47,5 @@ class CpuAdamX86Extension(_CudaExtension): "-U__CUDA_NO_HALF2_OPERATORS__", "-DTHRUST_IGNORE_CUB_VERSION_CHECK", ] - ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + super().nvcc_flags() return append_nvcc_threads(ret) diff --git a/extensions/flash_attention/__init__.py b/extensions/pybind/flash_attention/__init__.py similarity index 100% rename from extensions/flash_attention/__init__.py rename to extensions/pybind/flash_attention/__init__.py diff --git a/extensions/flash_attention/flash_attention_dao_cuda.py b/extensions/pybind/flash_attention/flash_attention_dao_cuda.py similarity index 98% rename from extensions/flash_attention/flash_attention_dao_cuda.py rename to extensions/pybind/flash_attention/flash_attention_dao_cuda.py index a2f2a52f1..a108377a8 100644 --- a/extensions/flash_attention/flash_attention_dao_cuda.py +++ b/extensions/pybind/flash_attention/flash_attention_dao_cuda.py @@ -1,4 +1,4 @@ -from ..base_extension import _Extension +from ...base_extension import _Extension class FlashAttentionDaoCudaExtension(_Extension): diff --git a/extensions/flash_attention/flash_attention_npu.py b/extensions/pybind/flash_attention/flash_attention_npu.py similarity index 97% rename from extensions/flash_attention/flash_attention_npu.py rename to extensions/pybind/flash_attention/flash_attention_npu.py index 0e01cefa1..8a30972b6 100644 --- a/extensions/flash_attention/flash_attention_npu.py +++ b/extensions/pybind/flash_attention/flash_attention_npu.py @@ -1,4 +1,4 @@ -from ..base_extension import _Extension +from ...base_extension import _Extension class FlashAttentionNpuExtension(_Extension): diff --git a/extensions/flash_attention/flash_attention_sdpa_cuda.py b/extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py similarity index 97% rename from extensions/flash_attention/flash_attention_sdpa_cuda.py rename to extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py index d3323a6aa..2f920db61 100644 --- a/extensions/flash_attention/flash_attention_sdpa_cuda.py +++ b/extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py @@ -1,4 +1,4 @@ -from ..base_extension import _Extension +from ...base_extension import _Extension class 
FlashAttentionSdpaCudaExtension(_Extension): diff --git a/extensions/inference/__init__.py b/extensions/pybind/inference/__init__.py similarity index 100% rename from extensions/inference/__init__.py rename to extensions/pybind/inference/__init__.py diff --git a/extensions/csrc/cuda/pybind/inference.cpp b/extensions/pybind/inference/inference.cpp similarity index 100% rename from extensions/csrc/cuda/pybind/inference.cpp rename to extensions/pybind/inference/inference.cpp diff --git a/extensions/pybind/inference/inference_ops_cuda.py b/extensions/pybind/inference/inference_ops_cuda.py new file mode 100644 index 000000000..b90638d62 --- /dev/null +++ b/extensions/pybind/inference/inference_ops_cuda.py @@ -0,0 +1,31 @@ +from ...cuda_extension import _CudaExtension +from ...utils import get_cuda_cc_flag + + +class InferenceOpsCudaExtension(_CudaExtension): + def __init__(self): + super().__init__(name="inference_ops_cuda") + + def sources_files(self): + ret = [ + self.csrc_abs_path(fname) + for fname in [ + "kernel/cuda/decode_kv_cache_memcpy_kernel.cu", + "kernel/cuda/context_kv_cache_memcpy_kernel.cu", + "kernel/cuda/fused_rotary_emb_and_cache_kernel.cu", + "kernel/cuda/activation_kernel.cu", + "kernel/cuda/rms_layernorm_kernel.cu", + "kernel/cuda/get_cos_and_sin_kernel.cu", + "kernel/cuda/flash_decoding_attention_kernel.cu", + ] + ] + [self.pybind_abs_path("inference/inference.cpp")] + return ret + + def cxx_flags(self): + version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"] + return ["-O3"] + version_dependent_macros + + def nvcc_flags(self): + extra_cuda_flags = ["-lineinfo"] + extra_cuda_flags.extend(get_cuda_cc_flag()) + return ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags() diff --git a/extensions/layernorm/__init__.py b/extensions/pybind/layernorm/__init__.py similarity index 100% rename from extensions/layernorm/__init__.py rename to extensions/pybind/layernorm/__init__.py diff --git a/extensions/csrc/cuda/pybind/layer_norm.cpp b/extensions/pybind/layernorm/layer_norm.cpp similarity index 99% rename from extensions/csrc/cuda/pybind/layer_norm.cpp rename to extensions/pybind/layernorm/layer_norm.cpp index b1f7c2543..77c4e38c8 100644 --- a/extensions/csrc/cuda/pybind/layer_norm.cpp +++ b/extensions/pybind/layernorm/layer_norm.cpp @@ -7,7 +7,7 @@ #include #include -#include "../../common/micros.h" +#include "common/micros.h" namespace { diff --git a/extensions/layernorm/layernorm_cuda.py b/extensions/pybind/layernorm/layernorm_cuda.py similarity index 57% rename from extensions/layernorm/layernorm_cuda.py rename to extensions/pybind/layernorm/layernorm_cuda.py index 36cf73590..951563e7e 100644 --- a/extensions/layernorm/layernorm_cuda.py +++ b/extensions/pybind/layernorm/layernorm_cuda.py @@ -1,5 +1,5 @@ -from ..cuda_extension import _CudaExtension -from ..utils import append_nvcc_threads, get_cuda_cc_flag +from ...cuda_extension import _CudaExtension +from ...utils import append_nvcc_threads, get_cuda_cc_flag class LayerNormCudaExtension(_CudaExtension): @@ -7,11 +7,13 @@ class LayerNormCudaExtension(_CudaExtension): super().__init__(name="layernorm_cuda") def sources_files(self): - ret = [self.csrc_abs_path(fname) for fname in ["cuda/pybind/layer_norm.cpp", "cuda/layer_norm_kernel.cu"]] + ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/layer_norm_kernel.cu"]] + [ + self.pybind_abs_path("layernorm/layer_norm.cpp") + ] return ret def include_dirs(self): - ret = [self.get_cuda_home_include()] + ret = 
[self.get_cuda_home_include()] + [self.csrc_abs_path("")] return ret def cxx_flags(self): @@ -20,5 +22,5 @@ class LayerNormCudaExtension(_CudaExtension): def nvcc_flags(self): extra_cuda_flags = ["-maxrregcount=50"] extra_cuda_flags.extend(get_cuda_cc_flag()) - ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + self.version_dependent_macros + ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + self.version_dependent_macros + super().nvcc_flags() return append_nvcc_threads(ret) diff --git a/extensions/moe/__init__.py b/extensions/pybind/moe/__init__.py similarity index 100% rename from extensions/moe/__init__.py rename to extensions/pybind/moe/__init__.py diff --git a/extensions/csrc/cuda/pybind/moe.cpp b/extensions/pybind/moe/moe.cpp similarity index 100% rename from extensions/csrc/cuda/pybind/moe.cpp rename to extensions/pybind/moe/moe.cpp diff --git a/extensions/moe/moe_cuda.py b/extensions/pybind/moe/moe_cuda.py similarity index 58% rename from extensions/moe/moe_cuda.py rename to extensions/pybind/moe/moe_cuda.py index 7a4744d4d..898ffe21c 100644 --- a/extensions/moe/moe_cuda.py +++ b/extensions/pybind/moe/moe_cuda.py @@ -1,17 +1,15 @@ -from ..cuda_extension import _CudaExtension -from ..utils import append_nvcc_threads, get_cuda_cc_flag +from ...cuda_extension import _CudaExtension +from ...utils import append_nvcc_threads, get_cuda_cc_flag class MoeCudaExtension(_CudaExtension): def __init__(self): super().__init__(name="moe_cuda") - def include_dirs(self): - ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()] - return ret - def sources_files(self): - ret = [self.csrc_abs_path(fname) for fname in ["cuda/pybind/moe.cpp", "cuda/moe_kernel.cu"]] + ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/moe_kernel.cu"]] + [ + self.pybind_abs_path("moe/moe.cpp") + ] return ret def cxx_flags(self): @@ -25,5 +23,5 @@ class MoeCudaExtension(_CudaExtension): "--expt-extended-lambda", ] extra_cuda_flags.extend(get_cuda_cc_flag()) - ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags() return append_nvcc_threads(ret) diff --git a/extensions/optimizer/__init__.py b/extensions/pybind/optimizer/__init__.py similarity index 100% rename from extensions/optimizer/__init__.py rename to extensions/pybind/optimizer/__init__.py diff --git a/extensions/optimizer/fused_optimizer_cuda.py b/extensions/pybind/optimizer/fused_optimizer_cuda.py similarity index 50% rename from extensions/optimizer/fused_optimizer_cuda.py rename to extensions/pybind/optimizer/fused_optimizer_cuda.py index 41c6260aa..13f3281fb 100644 --- a/extensions/optimizer/fused_optimizer_cuda.py +++ b/extensions/pybind/optimizer/fused_optimizer_cuda.py @@ -1,5 +1,5 @@ -from ..cuda_extension import _CudaExtension -from ..utils import get_cuda_cc_flag +from ...cuda_extension import _CudaExtension +from ...utils import get_cuda_cc_flag class FusedOptimizerCudaExtension(_CudaExtension): @@ -10,18 +10,13 @@ class FusedOptimizerCudaExtension(_CudaExtension): ret = [ self.csrc_abs_path(fname) for fname in [ - "cuda/pybind/optimizer.cpp", - "cuda/multi_tensor_sgd_kernel.cu", - "cuda/multi_tensor_scale_kernel.cu", - "cuda/multi_tensor_adam_kernel.cu", - "cuda/multi_tensor_l2norm_kernel.cu", - "cuda/multi_tensor_lamb_kernel.cu", + "kernel/cuda/multi_tensor_sgd_kernel.cu", + "kernel/cuda/multi_tensor_scale_kernel.cu", + "kernel/cuda/multi_tensor_adam_kernel.cu", + "kernel/cuda/multi_tensor_l2norm_kernel.cu", + 
"kernel/cuda/multi_tensor_lamb_kernel.cu", ] - ] - return ret - - def include_dirs(self): - ret = [self.get_cuda_home_include()] + ] + [self.pybind_abs_path("optimizer/optimizer.cpp")] return ret def cxx_flags(self): @@ -31,4 +26,4 @@ class FusedOptimizerCudaExtension(_CudaExtension): def nvcc_flags(self): extra_cuda_flags = ["-lineinfo"] extra_cuda_flags.extend(get_cuda_cc_flag()) - return ["-O3", "--use_fast_math"] + extra_cuda_flags + return ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags() diff --git a/extensions/csrc/cuda/pybind/optimizer.cpp b/extensions/pybind/optimizer/optimizer.cpp similarity index 100% rename from extensions/csrc/cuda/pybind/optimizer.cpp rename to extensions/pybind/optimizer/optimizer.cpp diff --git a/extensions/softmax/__init__.py b/extensions/pybind/softmax/__init__.py similarity index 100% rename from extensions/softmax/__init__.py rename to extensions/pybind/softmax/__init__.py diff --git a/extensions/csrc/cuda/pybind/scaled_masked_softmax.cpp b/extensions/pybind/softmax/scaled_masked_softmax.cpp similarity index 100% rename from extensions/csrc/cuda/pybind/scaled_masked_softmax.cpp rename to extensions/pybind/softmax/scaled_masked_softmax.cpp diff --git a/extensions/softmax/scaled_masked_softmax_cuda.py b/extensions/pybind/softmax/scaled_masked_softmax_cuda.py similarity index 66% rename from extensions/softmax/scaled_masked_softmax_cuda.py rename to extensions/pybind/softmax/scaled_masked_softmax_cuda.py index 797638c3b..049a8c7b5 100644 --- a/extensions/softmax/scaled_masked_softmax_cuda.py +++ b/extensions/pybind/softmax/scaled_masked_softmax_cuda.py @@ -1,5 +1,5 @@ -from ..cuda_extension import _CudaExtension -from ..utils import append_nvcc_threads +from ...cuda_extension import _CudaExtension +from ...utils import append_nvcc_threads class ScaledMaskedSoftmaxCudaExtension(_CudaExtension): @@ -7,15 +7,11 @@ class ScaledMaskedSoftmaxCudaExtension(_CudaExtension): super().__init__(name="scaled_masked_softmax_cuda") def sources_files(self): - ret = [ - self.csrc_abs_path(fname) - for fname in ["cuda/pybind/scaled_masked_softmax.cpp", "cuda/scaled_masked_softmax_kernel.cu"] + ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/scaled_masked_softmax_kernel.cu"]] + [ + self.pybind_abs_path("softmax/scaled_masked_softmax.cpp") ] return ret - def include_dirs(self): - return [self.get_cuda_home_include()] - def cxx_flags(self): return ["-O3"] + self.version_dependent_macros @@ -28,5 +24,5 @@ class ScaledMaskedSoftmaxCudaExtension(_CudaExtension): "-U__CUDA_NO_HALF2_OPERATORS__", "-DTHRUST_IGNORE_CUB_VERSION_CHECK", ] - ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + super().nvcc_flags() return append_nvcc_threads(ret) diff --git a/extensions/csrc/cuda/pybind/scaled_upper_triang_masked_softmax.cpp b/extensions/pybind/softmax/scaled_upper_triang_masked_softmax.cpp similarity index 100% rename from extensions/csrc/cuda/pybind/scaled_upper_triang_masked_softmax.cpp rename to extensions/pybind/softmax/scaled_upper_triang_masked_softmax.cpp diff --git a/extensions/softmax/scaled_upper_triangle_masked_softmax_cuda.py b/extensions/pybind/softmax/scaled_upper_triangle_masked_softmax_cuda.py similarity index 65% rename from extensions/softmax/scaled_upper_triangle_masked_softmax_cuda.py rename to extensions/pybind/softmax/scaled_upper_triangle_masked_softmax_cuda.py index d48d542ad..a179c2ac5 100644 --- 
a/extensions/softmax/scaled_upper_triangle_masked_softmax_cuda.py +++ b/extensions/pybind/softmax/scaled_upper_triangle_masked_softmax_cuda.py @@ -1,22 +1,18 @@ -from ..cuda_extension import _CudaExtension -from ..utils import append_nvcc_threads, get_cuda_cc_flag +from ...cuda_extension import _CudaExtension +from ...utils import append_nvcc_threads, get_cuda_cc_flag class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension): def __init__(self): super().__init__(name="scaled_upper_triangle_masked_softmax_cuda") - def include_dirs(self): - return [self.get_cuda_home_include()] - def sources_files(self): ret = [ self.csrc_abs_path(fname) for fname in [ - "cuda/pybind/scaled_upper_triang_masked_softmax.cpp", - "cuda/scaled_upper_triang_masked_softmax_kernel.cu", + "kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu", ] - ] + ] + [self.pybind_abs_path("softmax/scaled_upper_triang_masked_softmax.cpp")] return ret def cxx_flags(self): @@ -30,5 +26,5 @@ class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension): "--expt-extended-lambda", ] extra_cuda_flags.extend(get_cuda_cc_flag()) - ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags() return append_nvcc_threads(ret)
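
Note (illustrative, not part of the patch): after this refactor every pybind extension follows one pattern — kernel sources are resolved with csrc_abs_path() under the new kernel/ layout, the binding .cpp with pybind_abs_path() under extensions/pybind/, and include dirs plus compute-capability flags are inherited from _CudaExtension via super().include_dirs() / super().nvcc_flags() instead of being re-listed per op. A minimal sketch of a hypothetical new CUDA op in this convention (the class name, kernel file, and binding file below are invented for illustration):

    from ...cuda_extension import _CudaExtension
    from ...utils import append_nvcc_threads, get_cuda_cc_flag


    class MyOpCudaExtension(_CudaExtension):
        def __init__(self):
            super().__init__(name="my_op_cuda")

        def sources_files(self):
            # kernels live under extensions/csrc/kernel/, bindings under extensions/pybind/
            return [self.csrc_abs_path("kernel/cuda/my_op_kernel.cu")] + [
                self.pybind_abs_path("my_op/my_op.cpp")
            ]

        def cxx_flags(self):
            version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
            return ["-O3"] + version_dependent_macros

        def nvcc_flags(self):
            extra_cuda_flags = ["-lineinfo"]
            extra_cuda_flags.extend(get_cuda_cc_flag())
            # include dirs come from _CudaExtension.include_dirs(); only flags are composed here
            ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags()
            return append_nvcc_threads(ret)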