
[Inference/Refactor] Refactor compilation mechanism and unified multi hw (#5613)

* refactor compilation mechanism and unified multi hw

* fix file path bug

* add init.py to make pybind a module to avoid relative path error caused by softlink

* delete duplicated micros

* fix micros bug in gcc
傅剑寒 authored 7 months ago, committed by GitHub
commit 279300dc5f
64 changed files:
  1. examples/language/openmoe/model/modeling_openmoe.py (2 changes)
  2. extensions/__init__.py (18 changes)
  3. extensions/cpp_extension.py (4 changes)
  4. extensions/csrc/common/data_type.h (60 changes)
  5. extensions/csrc/common/micros.h (10 changes)
  6. extensions/csrc/common/vec_type_traits.h (69 changes)
  7. extensions/csrc/funcs/binary_functor.h (40 changes)
  8. extensions/csrc/funcs/cast_functor.h (49 changes)
  9. extensions/csrc/funcs/reduce_function.h (7 changes)
  10. extensions/csrc/funcs/ternary_functor.h (55 changes)
  11. extensions/csrc/funcs/unary_functor.h (19 changes)
  12. extensions/csrc/kernel/arm/cpu_adam_arm.cpp (0 changes)
  13. extensions/csrc/kernel/arm/cpu_adam_arm.h (0 changes)
  14. extensions/csrc/kernel/cuda/activation_kernel.cu (10 changes)
  15. extensions/csrc/kernel/cuda/attention/attention_utils.h (26 changes)
  16. extensions/csrc/kernel/cuda/context_kv_cache_memcpy_kernel.cu (2 changes)
  17. extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu (2 changes)
  18. extensions/csrc/kernel/cuda/flash_decoding_attention_kernel.cu (18 changes)
  19. extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu (4 changes)
  20. extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu (2 changes)
  21. extensions/csrc/kernel/cuda/layer_norm_kernel.cu (2 changes)
  22. extensions/csrc/kernel/cuda/moe_kernel.cu (15 changes)
  23. extensions/csrc/kernel/cuda/multi_tensor_adam_kernel.cu (2 changes)
  24. extensions/csrc/kernel/cuda/multi_tensor_apply.cuh (2 changes)
  25. extensions/csrc/kernel/cuda/multi_tensor_l2norm_kernel.cu (3 changes)
  26. extensions/csrc/kernel/cuda/multi_tensor_lamb_kernel.cu (2 changes)
  27. extensions/csrc/kernel/cuda/multi_tensor_scale_kernel.cu (2 changes)
  28. extensions/csrc/kernel/cuda/multi_tensor_sgd_kernel.cu (2 changes)
  29. extensions/csrc/kernel/cuda/rms_layernorm_kernel.cu (18 changes)
  30. extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu (10 changes)
  31. extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu (10 changes)
  32. extensions/csrc/kernel/cuda/utils/gpu_launch_config.h (0 changes)
  33. extensions/csrc/kernel/cuda/utils/micros.h (0 changes)
  34. extensions/csrc/kernel/cuda/utils/nvgpu_dev_info.h (0 changes)
  35. extensions/csrc/kernel/cuda/utils/vec_copy.h (11 changes)
  36. extensions/csrc/kernel/x86/cpu_adam.cpp (0 changes)
  37. extensions/csrc/kernel/x86/cpu_adam.h (0 changes)
  38. extensions/cuda_extension.py (7 changes)
  39. extensions/inference/inference_ops_cuda.py (36 changes)
  40. extensions/pybind/__init__.py (0 changes)
  41. extensions/pybind/cpu_adam/__init__.py (0 changes)
  42. extensions/pybind/cpu_adam/cpu_adam_arm.py (9 changes)
  43. extensions/pybind/cpu_adam/cpu_adam_x86.py (11 changes)
  44. extensions/pybind/flash_attention/__init__.py (0 changes)
  45. extensions/pybind/flash_attention/flash_attention_dao_cuda.py (2 changes)
  46. extensions/pybind/flash_attention/flash_attention_npu.py (2 changes)
  47. extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py (2 changes)
  48. extensions/pybind/inference/__init__.py (0 changes)
  49. extensions/pybind/inference/inference.cpp (0 changes)
  50. extensions/pybind/inference/inference_ops_cuda.py (31 changes)
  51. extensions/pybind/layernorm/__init__.py (0 changes)
  52. extensions/pybind/layernorm/layer_norm.cpp (2 changes)
  53. extensions/pybind/layernorm/layernorm_cuda.py (12 changes)
  54. extensions/pybind/moe/__init__.py (0 changes)
  55. extensions/pybind/moe/moe.cpp (0 changes)
  56. extensions/pybind/moe/moe_cuda.py (14 changes)
  57. extensions/pybind/optimizer/__init__.py (0 changes)
  58. extensions/pybind/optimizer/fused_optimizer_cuda.py (23 changes)
  59. extensions/pybind/optimizer/optimizer.cpp (0 changes)
  60. extensions/pybind/softmax/__init__.py (0 changes)
  61. extensions/pybind/softmax/scaled_masked_softmax.cpp (0 changes)
  62. extensions/pybind/softmax/scaled_masked_softmax_cuda.py (14 changes)
  63. extensions/pybind/softmax/scaled_upper_triang_masked_softmax.cpp (0 changes)
  64. extensions/pybind/softmax/scaled_upper_triangle_masked_softmax_cuda.py (14 changes)

2
examples/language/openmoe/model/modeling_openmoe.py

@@ -35,7 +35,7 @@ from transformers.utils import (
replace_return_docstrings,
)
from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN
from colossalai.kernel.extensions.pybind.flash_attention import HAS_FLASH_ATTN
from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
from colossalai.moe.layers import SparseMLP
from colossalai.moe.manager import MOE_MANAGER

18
extensions/__init__.py

@@ -1,10 +1,14 @@
from .cpu_adam import CpuAdamArmExtension, CpuAdamX86Extension
from .flash_attention import FlashAttentionDaoCudaExtension, FlashAttentionNpuExtension, FlashAttentionSdpaCudaExtension
from .inference import InferenceOpsCudaExtension
from .layernorm import LayerNormCudaExtension
from .moe import MoeCudaExtension
from .optimizer import FusedOptimizerCudaExtension
from .softmax import ScaledMaskedSoftmaxCudaExtension, ScaledUpperTriangleMaskedSoftmaxCudaExtension
from .pybind.cpu_adam import CpuAdamArmExtension, CpuAdamX86Extension
from .pybind.flash_attention import (
FlashAttentionDaoCudaExtension,
FlashAttentionNpuExtension,
FlashAttentionSdpaCudaExtension,
)
from .pybind.inference import InferenceOpsCudaExtension
from .pybind.layernorm import LayerNormCudaExtension
from .pybind.moe import MoeCudaExtension
from .pybind.optimizer import FusedOptimizerCudaExtension
from .pybind.softmax import ScaledMaskedSoftmaxCudaExtension, ScaledUpperTriangleMaskedSoftmaxCudaExtension
ALL_EXTENSIONS = [
CpuAdamArmExtension,

4
extensions/cpp_extension.py

@@ -25,6 +25,9 @@ class _CppExtension(_Extension):
def csrc_abs_path(self, path):
return os.path.join(self.relative_to_abs_path("csrc"), path)
def pybind_abs_path(self, path):
return os.path.join(self.relative_to_abs_path("pybind"), path)
def relative_to_abs_path(self, code_path: str) -> str:
"""
This function takes in a path relative to the colossalai root directory and return the absolute path.
@@ -116,6 +119,7 @@ class _CppExtension(_Extension):
"""
This function should return a list of include files for extensions.
"""
return [self.csrc_abs_path("")]
@abstractmethod
def cxx_flags(self) -> List[str]:

60
extensions/csrc/common/data_type.h

@@ -0,0 +1,60 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
namespace colossalAI {
namespace dtype {
struct bfloat164 {
#ifdef COLOSSAL_WITH_CUDA
__nv_bfloat162 x;
__nv_bfloat162 y;
#endif
};
struct bfloat168 {
#ifdef COLOSSAL_WITH_CUDA
__nv_bfloat162 x;
__nv_bfloat162 y;
__nv_bfloat162 z;
__nv_bfloat162 w;
#endif
};
struct half4 {
#ifdef COLOSSAL_WITH_CUDA
half2 x;
half2 y;
#endif
};
struct half8 {
#ifdef COLOSSAL_WITH_CUDA
half2 x;
half2 y;
half2 z;
half2 w;
#endif
};
struct float4_ {
#ifdef COLOSSAL_WITH_CUDA
float2 x;
float2 y;
#endif
};
struct float8_ {
#ifdef COLOSSAL_WITH_CUDA
float2 x;
float2 y;
float2 z;
float2 w;
#endif
};
} // namespace dtype
} // namespace colossalAI
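The packed structs above exist so kernels can move several 16-bit values per memory transaction. A minimal sketch of the intended use, assuming a CUDA build with COLOSSAL_WITH_CUDA defined (the load_bf16x4 helper is hypothetical, not part of the commit):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda_bf16.h>

#include "common/data_type.h"

// Hypothetical helper: read four bfloat16 values in a single 8-byte
// transaction by viewing them as one dtype::bfloat164.
__device__ colossalAI::dtype::bfloat164 load_bf16x4(const __nv_bfloat16* src) {
  return *reinterpret_cast<const colossalAI::dtype::bfloat164*>(src);
}

static_assert(sizeof(colossalAI::dtype::bfloat164) == 8,
              "two __nv_bfloat162 = four bfloat16 values");
#endif
```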

10
extensions/csrc/common/micros.h

@@ -222,3 +222,13 @@
AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), \
"'"); \
}
#if defined(COLOSSAL_WITH_CUDA)
#define HOST __host__
#define DEVICE __device__
#define HOSTDEVICE __host__ __device__
#else
#define HOST
#define DEVICE
#define HOSTDEVICE
#endif
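With these portability macros, a helper can be written once and compiled for host, device, or both. A usage sketch (the clamp function is illustrative, not from the diff):

```cpp
#include "common/micros.h"

// Under nvcc with COLOSSAL_WITH_CUDA this is __host__ __device__;
// under a CPU-only toolchain the macro expands to nothing and this is
// an ordinary C++ template.
template <typename T>
HOSTDEVICE T clamp(T v, T lo, T hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}
```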

69
extensions/csrc/cuda/utils/vec_type_traits.h → extensions/csrc/common/vec_type_traits.h

@@ -1,48 +1,16 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <stdint.h>
#include <torch/extension.h>
#include <cfloat>
#include "common/data_type.h"
namespace colossalAI {
namespace cuda {
namespace utils {
struct bfloat164 {
__nv_bfloat162 x;
__nv_bfloat162 y;
};
struct bfloat168 {
__nv_bfloat162 x;
__nv_bfloat162 y;
__nv_bfloat162 z;
__nv_bfloat162 w;
};
struct half4 {
half2 x;
half2 y;
};
struct half8 {
half2 x;
half2 y;
half2 z;
half2 w;
};
struct float4_ {
float2 x;
float2 y;
};
struct float8_ {
float2 x;
float2 y;
float2 z;
float2 w;
};
namespace common {
template <typename T, int VecSize>
struct VecTypeTrait {};
@@ -57,6 +25,8 @@ struct FloatVecTypeTrait {};
};
VEC_TYPE_TRAITS_SPECIALIZATION(T, 1, T, typename T)
#if defined(COLOSSAL_WITH_CUDA)
VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 1, __nv_bfloat16)
VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 2, __nv_bfloat162)
VEC_TYPE_TRAITS_SPECIALIZATION(at::BFloat16, 4, float2)
@@ -67,16 +37,17 @@ VEC_TYPE_TRAITS_SPECIALIZATION(at::Half, 4, float2)
VEC_TYPE_TRAITS_SPECIALIZATION(at::Half, 8, float4)
VEC_TYPE_TRAITS_SPECIALIZATION(float, 2, float2)
VEC_TYPE_TRAITS_SPECIALIZATION(float, 4, float4)
VEC_TYPE_TRAITS_SPECIALIZATION(float, 8, float8_)
VEC_TYPE_TRAITS_SPECIALIZATION(float, 8, dtype::float8_)
VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 2, half)
VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 4, half2)
VEC_TYPE_TRAITS_SPECIALIZATION(uint8_t, 8, float2)
VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 2, __nv_bfloat162);
VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 4, bfloat164);
VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 8, bfloat168);
VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 4, dtype::bfloat164);
VEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat16, 8, dtype::bfloat168);
VEC_TYPE_TRAITS_SPECIALIZATION(half, 2, half2);
VEC_TYPE_TRAITS_SPECIALIZATION(half, 4, half4);
VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, half8);
VEC_TYPE_TRAITS_SPECIALIZATION(half, 4, dtype::half4);
VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, dtype::half8);
#endif /* defined(COLOSSAL_WITH_CUDA) */
#undef VEC_TYPE_TRAITS_SPECIALIZATION
@@ -86,17 +57,17 @@ VEC_TYPE_TRAITS_SPECIALIZATION(half, 8, half8);
using Type = FLOATT; \
};
#if defined(COLOSSAL_WITH_CUDA)
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(float2, float2)
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(float4, float4)
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(__nv_bfloat162, float2);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(bfloat164, float4_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(bfloat168, float8_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::bfloat164, dtype::float4_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::bfloat168, dtype::float8_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half2, float2);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half4, float4_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(half8, float8_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::half4, dtype::float4_);
FLOATVEC_TYPE_TRAITS_SPECIALIZATION(dtype::half8, dtype::float8_);
#endif /* COLOSSAL_WITH_CUDA */
#undef FLOATVEC_TYPE_TRAITS_SPECIALIZATION
} // namespace utils
} // namespace cuda
} // namespace common
} // namespace colossalAI
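VecTypeTrait maps an element type plus a vector width to the widest type that moves those elements in one transaction. A stripped-down re-creation of the pattern for orientation (illustrative only, using CUDA's built-in float2/float4; the real header also covers at::Half, at::BFloat16, and the dtype:: structs):

```cpp
#include <cuda_runtime.h>  // float2 / float4 vector types

template <typename T, int VecSize>
struct VecTypeTrait {};

#define VEC_TYPE_TRAITS_SPECIALIZATION(T, VEC_SIZE, VECT) \
  template <>                                             \
  struct VecTypeTrait<T, VEC_SIZE> {                      \
    using Type = VECT;                                    \
  };

VEC_TYPE_TRAITS_SPECIALIZATION(float, 2, float2)
VEC_TYPE_TRAITS_SPECIALIZATION(float, 4, float4)
#undef VEC_TYPE_TRAITS_SPECIALIZATION

// VecTypeTrait<float, 4>::Type is float4: one 16-byte access moves 4 floats.
```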

40
extensions/csrc/cuda/funcs/binary_functor.h → extensions/csrc/funcs/binary_functor.h

@@ -1,27 +1,21 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#endif
#include <functional>
#include "../utils/micros.h"
#include "../utils/vec_type_traits.h"
#include "cast_functor.h"
#include "common/data_type.h"
#include "common/micros.h"
namespace colossalAI {
namespace cuda {
namespace funcs {
using utils::bfloat164;
using utils::bfloat168;
using utils::float4_;
using utils::float8_;
using utils::half4;
using utils::half8;
enum class BinaryOpType { kAdd = 0, kMinus, kMul, kDiv, kMax, kMin };
// Note(LiuYang): This file provides base math operation for data type
@@ -61,6 +55,7 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(T, T, T, BinaryOpType::kMin, HOSTDEVICE,
STMTS_WRAPPER({ return min(lhs, rhs); }),
typename T)
#if defined(COLOSSAL_WITH_CUDA)
COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(half, half, half, BinaryOpType::kAdd,
DEVICE, STMTS_WRAPPER({
return __hadd(lhs, rhs);
@@ -151,8 +146,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
}))
COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
bfloat164, bfloat164, float4_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({
float4_ fc;
dtype::bfloat164, dtype::bfloat164, dtype::float4_, BinaryOpType::kMul,
DEVICE, STMTS_WRAPPER({
dtype::float4_ fc;
BinaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
BinaryOpType::kMul>
mul;
@@ -162,8 +158,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
}))
COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
bfloat168, bfloat168, float8_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({
float8_ fc;
dtype::bfloat168, dtype::bfloat168, dtype::float8_, BinaryOpType::kMul,
DEVICE, STMTS_WRAPPER({
dtype::float8_ fc;
BinaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
BinaryOpType::kMul>
mul;
@@ -184,8 +181,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
}))
COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
half4, half4, float4_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({
float4_ fc;
dtype::half4, dtype::half4, dtype::float4_, BinaryOpType::kMul, DEVICE,
STMTS_WRAPPER({
dtype::float4_ fc;
BinaryOpFunctor<half2, half2, float2, BinaryOpType::kMul> mul;
fc.x = mul(lhs.x, rhs.x);
fc.y = mul(lhs.y, rhs.y);
@@ -193,8 +191,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
}))
COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
half8, half8, float8_, BinaryOpType::kMul, DEVICE, STMTS_WRAPPER({
float8_ fc;
dtype::half8, dtype::half8, dtype::float8_, BinaryOpType::kMul, DEVICE,
STMTS_WRAPPER({
dtype::float8_ fc;
BinaryOpFunctor<half2, half2, float2, BinaryOpType::kMul> mul;
fc.x = mul(lhs.x, rhs.x);
fc.y = mul(lhs.y, rhs.y);
@@ -203,10 +202,9 @@ COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION(
return fc;
}))
#undef COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION
#endif /* defined(COLOSSAL_WITH_CUDA) */
#undef COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION
#undef STMTS_WRAPPER
} // namespace funcs
} // namespace cuda
} // namespace colossalAI
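For orientation, this is roughly what one of the specializations above expands to once COLOSSAL_BINARY_FUNCTOR_SPECIALIZATION and STMTS_WRAPPER are unrolled (a sketch; the enum and primary template are repeated so the snippet stands alone):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda_fp16.h>

enum class BinaryOpType { kAdd = 0, kMinus, kMul, kDiv, kMax, kMin };

template <typename LT, typename RT, typename RET, BinaryOpType OpT>
struct BinaryOpFunctor;

// Expansion of the half/kAdd specialization: a device functor over __hadd.
template <>
struct BinaryOpFunctor<half, half, half, BinaryOpType::kAdd> {
  __device__ half operator()(half lhs, half rhs) { return __hadd(lhs, rhs); }
};
#endif
```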

49
extensions/csrc/cuda/funcs/cast_functor.h → extensions/csrc/funcs/cast_functor.h

@@ -1,29 +1,23 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#endif
#include <functional>
#include "../utils/micros.h"
#include "../utils/vec_type_traits.h"
#include "common/data_type.h"
#include "common/micros.h"
// Note(LiuYang): This file provides base math operation for data type
// include POD and cuda built-in type such as half and __nv_bfloat16
namespace colossalAI {
namespace cuda {
namespace funcs {
using utils::bfloat164;
using utils::bfloat168;
using utils::float4_;
using utils::float8_;
using utils::half4;
using utils::half8;
template <typename From, typename To>
struct CastFunctor : public std::unary_function<From, To> {
HOSTDEVICE To operator()(From val) { return static_cast<To>(val); }
@@ -36,6 +30,7 @@ struct CastFunctor : public std::unary_function<From, To> {
FUNCTION_MODIFIER TO operator()(FROM val) STMTS \
};
#if defined(COLOSSAL_WITH_CUDA)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
int2, float2, { return make_float2(val.x, val.y); }, DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
@@ -54,27 +49,27 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
half, float, { return __half2float(val); }, DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float4, half4,
float4, dtype::half4,
{
half4 dst;
dtype::half4 dst;
dst.x = __floats2half2_rn(val.x, val.y);
dst.y = __floats2half2_rn(val.z, val.w);
return dst;
},
DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float4_, half4,
dtype::float4_, dtype::half4,
{
half4 dst;
dtype::half4 dst;
dst.x = __float22half2_rn(val.x);
dst.y = __float22half2_rn(val.y);
return dst;
},
DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float8_, half8,
dtype::float8_, dtype::half8,
{
half8 dst;
dtype::half8 dst;
dst.x = __float22half2_rn(val.x);
dst.y = __float22half2_rn(val.y);
dst.z = __float22half2_rn(val.z);
@@ -88,9 +83,9 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float, __nv_bfloat16, { return __float2bfloat16_rn(val); }, DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float4, bfloat164,
float4, dtype::bfloat164,
{
bfloat164 dst;
dtype::bfloat164 dst;
dst.x = __floats2bfloat162_rn(val.x, val.y);
dst.y = __floats2bfloat162_rn(val.z, val.w);
return dst;
@@ -105,18 +100,18 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float2, __nv_bfloat162, { return __float22bfloat162_rn(val); }, DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float4_, bfloat164,
dtype::float4_, dtype::bfloat164,
{
bfloat164 dst;
dtype::bfloat164 dst;
dst.x = __float22bfloat162_rn(val.x);
dst.y = __float22bfloat162_rn(val.y);
return dst;
},
DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float8_, bfloat168,
dtype::float8_, dtype::bfloat168,
{
bfloat168 dst;
dtype::bfloat168 dst;
dst.x = __float22bfloat162_rn(val.x);
dst.y = __float22bfloat162_rn(val.y);
dst.z = __float22bfloat162_rn(val.z);
@@ -141,18 +136,18 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float2, __nv_bfloat162, { return __floats2bfloat162_rn(val.x, val.y); },
DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float4_, bfloat164,
dtype::float4_, dtype::bfloat164,
{
bfloat164 dst;
dtype::bfloat164 dst;
dst.x = __floats2bfloat162_rn(val.x.x, val.x.y);
dst.y = __floats2bfloat162_rn(val.y.x, val.y.y);
return dst;
},
DEVICE)
COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
float8_, bfloat168,
dtype::float8_, dtype::bfloat168,
{
bfloat168 dst;
dtype::bfloat168 dst;
dst.x = __floats2bfloat162_rn(val.x.x, val.x.y);
dst.y = __floats2bfloat162_rn(val.y.x, val.y.y);
dst.z = __floats2bfloat162_rn(val.z.x, val.z.y);
@@ -161,8 +156,8 @@ COLOSSAL_CAST_FUNCTOR_SPECIALIZATION(
},
DEVICE)
#endif /* defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 */
#endif /* defined(COLOSSAL_WITH_CUDA) */
#undef COLOSSAL_CAST_FUNCTOR_SPECIALIZATION
} // namespace funcs
} // namespace cuda
} // namespace colossalAI
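The cast functors give every conversion the same call syntax while hiding the type-specific intrinsic. Unrolled, the float4 → dtype::half4 specialization above looks like this (sketch; a simplified primary template is repeated for self-containment):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda_fp16.h>

#include "common/data_type.h"

template <typename From, typename To>
struct CastFunctor {
  To operator()(From val) { return static_cast<To>(val); }
};

// Pack four floats into two half2 registers, rounding to nearest even.
template <>
struct CastFunctor<float4, colossalAI::dtype::half4> {
  __device__ colossalAI::dtype::half4 operator()(float4 val) {
    colossalAI::dtype::half4 dst;
    dst.x = __floats2half2_rn(val.x, val.y);
    dst.y = __floats2half2_rn(val.z, val.w);
    return dst;
  }
};
#endif
```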

7
extensions/csrc/cuda/funcs/reduce_function.h → extensions/csrc/funcs/reduce_function.h

@@ -1,13 +1,13 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "../funcs/binary_functor.h"
#include "binary_functor.h"
namespace colossalAI {
namespace cuda {
namespace funcs {
const float kReduceFloatInfNeg = -100000000.f;
@@ -89,5 +89,6 @@ __forceinline__ __device__ void block_reduce(T* pval) {
#undef COLOSSAL_BLOCK_REDUCE_IMPL
} // namespace funcs
} // namespace cuda
} // namespace colossalAI
#endif /* defined(COLOSSAL_WITH_CUDA) */
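block_reduce is built from warp-level shuffles. A self-contained sketch of the warp stage, assuming a full 32-thread warp (the real header additionally dispatches on ReduceType and combines warps through shared memory):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
// Butterfly reduction: after five rounds of XOR shuffles every lane
// of the warp holds the full sum.
template <typename T>
__forceinline__ __device__ T warp_reduce_sum(T val) {
#pragma unroll
  for (int offset = 16; offset > 0; offset >>= 1) {
    val += __shfl_xor_sync(0xffffffff, val, offset);
  }
  return val;
}
#endif
```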

55
extensions/csrc/cuda/funcs/ternary_functor.h → extensions/csrc/funcs/ternary_functor.h

@@ -1,18 +1,20 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#endif
#include <float.h>
#include <functional>
#include "../funcs/cast_functor.h"
#include "../utils/micros.h"
#include "cast_functor.h"
#include "common/micros.h"
namespace colossalAI {
namespace cuda {
namespace funcs {
enum class TernaryOpType { kFma = 0 };
@@ -29,6 +31,7 @@ struct TernaryOpFunctor;
FUNCTION_MODIFIER RET operator()(LT a, RT b, RET c) STMTS \
};
#if defined(COLOSSAL_WITH_CUDA)
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(float, float, float,
TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
@@ -91,16 +94,18 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fma(cast(a), b, c);
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
half4, half4, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float4_ fd;
dtype::half4, dtype::half4, dtype::float4_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
dtype::float4_ fd;
TernaryOpFunctor<half2, half2, float2, TernaryOpType::kFma> fma;
fd.x = fma(a.x, b.x, c.x);
fd.y = fma(a.y, b.y, c.y);
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
half, half4, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float4_ fd;
half, dtype::half4, dtype::float4_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
dtype::float4_ fd;
CastFunctor<half, half2> cast;
TernaryOpFunctor<half2, half2, float2, TernaryOpType::kFma> fma;
half2 s = cast(a);
@@ -109,8 +114,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
half8, half8, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float8_ fd;
dtype::half8, dtype::half8, dtype::float8_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
dtype::float8_ fd;
TernaryOpFunctor<half2, half2, float2, TernaryOpType::kFma> fma;
fd.x = fma(a.x, b.x, c.x);
fd.y = fma(a.y, b.y, c.y);
@@ -119,8 +125,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
half, half8, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float8_ fd;
half, dtype::half8, dtype::float8_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
dtype::float8_ fd;
CastFunctor<half, half2> cast;
TernaryOpFunctor<half2, half2, float2, TernaryOpType::kFma> fma;
half2 s = cast(a);
@@ -153,8 +160,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fma(cast(a), b, c);
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
bfloat164, bfloat164, float4_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float4_ fd;
dtype::bfloat164, dtype::bfloat164, dtype::float4_, TernaryOpType::kFma,
DEVICE, STMTS_WRAPPER({
dtype::float4_ fd;
TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
TernaryOpType::kFma>
fma;
@@ -163,9 +171,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
__nv_bfloat16, bfloat164, float4_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
float4_ fd;
__nv_bfloat16, dtype::bfloat164, dtype::float4_, TernaryOpType::kFma,
DEVICE, STMTS_WRAPPER({
dtype::float4_ fd;
CastFunctor<__nv_bfloat16, __nv_bfloat162> cast;
TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
TernaryOpType::kFma>
@@ -176,8 +184,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
bfloat168, bfloat168, float8_, TernaryOpType::kFma, DEVICE, STMTS_WRAPPER({
float8_ fd;
dtype::bfloat168, dtype::bfloat168, dtype::float8_, TernaryOpType::kFma,
DEVICE, STMTS_WRAPPER({
dtype::float8_ fd;
TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
TernaryOpType::kFma>
fma;
@@ -188,9 +197,9 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
__nv_bfloat16, bfloat168, float8_, TernaryOpType::kFma, DEVICE,
STMTS_WRAPPER({
float8_ fd;
__nv_bfloat16, dtype::bfloat168, dtype::float8_, TernaryOpType::kFma,
DEVICE, STMTS_WRAPPER({
dtype::float8_ fd;
CastFunctor<__nv_bfloat16, __nv_bfloat162> cast;
TernaryOpFunctor<__nv_bfloat162, __nv_bfloat162, float2,
TernaryOpType::kFma>
@@ -203,10 +212,10 @@ COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION(
return fd;
}))
#undef COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION
#endif /* defined(COLOSSAL_WITH_CUDA) */
#undef COLOSSAL_TERNARY_FUNCTOR_SPECIALIZATION
#undef STMTS_WRAPPER
} // namespace funcs
} // namespace cuda
} // namespace colossalAI

19
extensions/csrc/cuda/funcs/unary_functor.h → extensions/csrc/funcs/unary_functor.h

@@ -1,16 +1,18 @@
#pragma once
#if defined(COLOSSAL_WITH_CUDA)
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#endif
#include <functional>
#include "../utils/micros.h"
#include "common/data_type.h"
#include "common/micros.h"
namespace colossalAI {
namespace cuda {
namespace funcs {
template <typename T>
@@ -57,27 +59,30 @@ COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(int, int, UnaryOpType::kLog2Ceil,
return log2_value;
})
#if defined(COLOSSAL_WITH_CUDA)
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float2, float, UnaryOpType::kSum, DEVICE,
{ return val.x + val.y; })
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float4, float, UnaryOpType::kSum, DEVICE,
{ return val.x + val.y + val.z + val.w; })
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float4_, float, UnaryOpType::kSum, DEVICE,
{
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(dtype::float4_, float, UnaryOpType::kSum,
DEVICE, {
return val.x.x + val.x.y + val.y.x +
val.y.y;
})
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(float8_, float, UnaryOpType::kSum, DEVICE,
{
COLOSSAL_UNARY_FUNCTOR_SPECIALIZATION(dtype::float8_, float, UnaryOpType::kSum,
DEVICE, {
return val.x.x + val.x.y + val.y.x +
val.y.y + val.z.x + val.z.y +
val.w.x + val.w.y;
})
#endif /* defined(COLOSSAL_WITH_CUDA) */
#undef COLOSSAL_UARY_FUNCTOR_SPECIALIZATION
} // namespace funcs
} // namespace cuda
} // namespace colossalAI
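The kLog2Ceil case shown truncated above computes ceil(log2(val)), typically used to size softmax kernel launches. Its logic restated as a plain function (a sketch, assuming the functor body follows the usual pattern):

```cpp
// Smallest log2_value such that (1 << log2_value) >= val.
inline int log2_ceil(int val) {
  int log2_value = 0;
  while ((1 << log2_value) < val) ++log2_value;
  return log2_value;
}
```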

0
extensions/csrc/arm/cpu_adam_arm.cpp → extensions/csrc/kernel/arm/cpu_adam_arm.cpp

0
extensions/csrc/arm/cpu_adam_arm.h → extensions/csrc/kernel/arm/cpu_adam_arm.h

10
extensions/csrc/cuda/activation_kernel.cu → extensions/csrc/kernel/cuda/activation_kernel.cu

@@ -2,13 +2,15 @@
#include <torch/extension.h>
#include <stdio.h>
#include "../common/micros.h"
#include "../common/mp_type_traits.h"
#include "common/micros.h"
#include "common/mp_type_traits.h"
using colossalAI::common::MPTypeTrait;
template<typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
using MT = typename colossalAI::common::MPTypeTrait<T>::Type;
using MT = typename MPTypeTrait<T>::Type;
return static_cast<T>((static_cast<MT>(x)) / (static_cast<MT>(1.0f) + expf(static_cast<MT>(-x))));
}
@@ -17,7 +19,7 @@ __global__ void act_and_mul_kernel(
const scalar_t* __restrict__ ins_data,
scalar_t* __restrict__ outs_data,
const int64_t numel) {
using MT = typename colossalAI::common::MPTypeTrait<scalar_t>::Type;
using MT = typename MPTypeTrait<scalar_t>::Type;
int64_t idx = static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
const int64_t grid_size = blockDim.x * gridDim.x;
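MPTypeTrait, pulled in via common/mp_type_traits.h, picks the "math type" that a low-precision element is upcast to before computing. A sketch of the assumed shape of that trait (the real header's specializations may differ):

```cpp
// Low-precision elements (e.g. at::Half, at::BFloat16) compute in float;
// silu_kernel casts x up, evaluates x * sigmoid(x), then casts back down.
template <typename T>
struct MPTypeTrait {
  using Type = float;
};
```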

26
extensions/csrc/cuda/attention/attention_utils.h → extensions/csrc/kernel/cuda/attention/attention_utils.h

@@ -23,24 +23,16 @@
#include <cuda_fp16.h>
#include <float.h>
#include "../funcs/binary_functor.h"
#include "../funcs/cast_functor.h"
#include "../funcs/ternary_functor.h"
#include "../funcs/unary_functor.h"
#include "../utils/vec_type_traits.h"
#include "common/vec_type_traits.h"
#include "funcs/binary_functor.h"
#include "funcs/cast_functor.h"
#include "funcs/ternary_functor.h"
#include "funcs/unary_functor.h"
namespace colossalAI {
namespace cuda {
namespace attention {
using colossalAI::cuda::funcs::BinaryOpFunctor;
using colossalAI::cuda::funcs::BinaryOpType;
using colossalAI::cuda::funcs::TernaryOpFunctor;
using colossalAI::cuda::funcs::TernaryOpType;
using colossalAI::cuda::funcs::UnaryOpFunctor;
using colossalAI::cuda::funcs::UnaryOpType;
using colossalAI::cuda::utils::FloatVecTypeTrait;
#define WARP_SIZE 32
#define VEC_SIZE_8 8
@@ -51,11 +43,11 @@ using colossalAI::cuda::utils::FloatVecTypeTrait;
// Q*K^T operation.
template <int NUM_THREADS_PER_TOKEN, typename VecT, int N>
inline __device__ float qk_dot_(const VecT (&q)[N], const VecT (&k)[N]) {
using A_vec = typename FloatVecTypeTrait<VecT>::Type;
using A_vec = typename common::FloatVecTypeTrait<VecT>::Type;
// Compute the parallel products for Q*K^T (treat vector lanes separately).
BinaryOpFunctor<VecT, VecT, A_vec, BinaryOpType::kMul> mul_vect;
UnaryOpFunctor<A_vec, float, UnaryOpType::kSum> sum_vect;
TernaryOpFunctor<VecT, VecT, A_vec, TernaryOpType::kFma> fma;
funcs::BinaryOpFunctor<VecT, VecT, A_vec, funcs::BinaryOpType::kMul> mul_vect;
funcs::UnaryOpFunctor<A_vec, float, funcs::UnaryOpType::kSum> sum_vect;
funcs::TernaryOpFunctor<VecT, VecT, A_vec, funcs::TernaryOpType::kFma> fma;
A_vec qk_vec = mul_vect(q[0], k[0]);
#pragma unroll
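The hunk cuts off inside qk_dot_; the function goes on to fma-accumulate the remaining lanes and then reduce across the threads that share one token. A simplified, self-contained version of the same pattern for float4 registers (assumption: NUM_THREADS_PER_TOKEN is a power of two):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
template <int NUM_THREADS_PER_TOKEN, int N>
__device__ float qk_dot_sketch(const float4 (&q)[N], const float4 (&k)[N]) {
  float qk = 0.f;
#pragma unroll
  for (int i = 0; i < N; ++i) {  // per-thread partial dot product
    qk += q[i].x * k[i].x + q[i].y * k[i].y +
          q[i].z * k[i].z + q[i].w * k[i].w;
  }
#pragma unroll
  for (int mask = NUM_THREADS_PER_TOKEN / 2; mask > 0; mask >>= 1) {
    qk += __shfl_xor_sync(0xffffffff, qk, mask);  // merge partial sums
  }
  return qk;
}
#endif
```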

2
extensions/csrc/cuda/context_kv_cache_memcpy_kernel.cu → extensions/csrc/kernel/cuda/context_kv_cache_memcpy_kernel.cu

@@ -2,7 +2,7 @@
#include <torch/extension.h>
#include "utils/vec_copy.h"
#include "../common/micros.h"
#include "common/micros.h"
using colossalAI::cuda::utils::copy_vector;
using colossalAI::cuda::utils::get_vec_size;

2
extensions/csrc/cuda/decode_kv_cache_memcpy_kernel.cu → extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu

@@ -2,7 +2,7 @@
#include <torch/extension.h>
#include "utils/vec_copy.h"
#include "../common/micros.h"
#include "common/micros.h"
using colossalAI::cuda::utils::copy_vector;
using colossalAI::cuda::utils::get_vec_size;

18
extensions/csrc/cuda/flash_decoding_attention_kernel.cu → extensions/csrc/kernel/cuda/flash_decoding_attention_kernel.cu

@@ -7,11 +7,11 @@
#include <c10/cuda/CUDAGuard.h>
#include <stdio.h>
#include "../common/micros.h"
#include "common/micros.h"
#include "funcs/cast_functor.h"
#include "funcs/ternary_functor.h"
#include "funcs/binary_functor.h"
#include "utils/vec_type_traits.h"
#include "common/vec_type_traits.h"
#include "attention/attention_utils.h"
#define WARP_SIZE 32
@@ -34,13 +34,13 @@ constexpr unsigned int nextHighestPowerOf2(unsigned int v) {
return v;
}
using colossalAI::cuda::funcs::BinaryOpType;
using colossalAI::cuda::funcs::CastFunctor;
using colossalAI::cuda::funcs::TernaryOpFunctor;
using colossalAI::cuda::funcs::TernaryOpType;
using colossalAI::cuda::funcs::zero;
using colossalAI::cuda::utils::VecTypeTrait;
using colossalAI::cuda::utils::FloatVecTypeTrait;
using colossalAI::funcs::BinaryOpType;
using colossalAI::funcs::CastFunctor;
using colossalAI::funcs::TernaryOpFunctor;
using colossalAI::funcs::TernaryOpType;
using colossalAI::funcs::zero;
using colossalAI::common::VecTypeTrait;
using colossalAI::common::FloatVecTypeTrait;
using namespace colossalAI::cuda::attention;

4
extensions/csrc/cuda/fused_rotary_emb_and_cache_kernel.cu → extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu

@@ -3,8 +3,8 @@
#include <torch/extension.h>
#include "utils/vec_copy.h"
#include "../common/micros.h"
#include "../common/mp_type_traits.h"
#include "common/micros.h"
#include "common/mp_type_traits.h"
using colossalAI::cuda::utils::copy_vector;
using colossalAI::cuda::utils::get_vec_size;

2
extensions/csrc/cuda/get_cos_and_sin_kernel.cu → extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu

@@ -2,7 +2,7 @@
#include <torch/extension.h>
#include "utils/vec_copy.h"
#include "../common/micros.h"
#include "common/micros.h"
using colossalAI::cuda::utils::copy_vector;
using colossalAI::cuda::utils::get_vec_size;

2
extensions/csrc/cuda/layer_norm_kernel.cu → extensions/csrc/kernel/cuda/layer_norm_kernel.cu

@@ -9,7 +9,7 @@
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/DeviceUtils.cuh"
#include "../common/micros.h"
#include "common/micros.h"
template <typename U>
__device__ void cuWelfordOnlineSum(const U curr, U& mu, U& sigma2, U& count) {

15
extensions/csrc/cuda/moe_kernel.cu → extensions/csrc/kernel/cuda/moe_kernel.cu

@@ -6,9 +6,8 @@
#include "funcs/reduce_function.h"
using colossalAI::cuda::funcs::block_reduce;
using colossalAI::cuda::funcs::ReduceType;
using colossalAI::funcs::block_reduce;
using colossalAI::funcs::ReduceType;
template <typename T, int block_size, int pack_size>
__device__ void moe_dpch_one_fwd(T *src_row, T *dst_row, const int cols) {
@@ -540,7 +539,7 @@ void cumsum_launch(int *inputs, int *outputs, const int s, const int e) {
// API FUNCTIONS --------------------------------
#define DISPATCH_FLOAT_AND_HALF(TYPE, NAME, ...) \
#define DISPATCH_FLOAT_AND_HALF_MOE(TYPE, NAME, ...) \
switch (TYPE) { \
case at::ScalarType::Float: { \
using scalar_t = float; \
@@ -566,7 +565,7 @@ torch::Tensor moe_dispatch_cuda_forward(int s, int ec, int h,
torch::dtype(batch_tokens.dtype()).device(batch_tokens.device()));
auto k = mask.size(0);
DISPATCH_FLOAT_AND_HALF(
DISPATCH_FLOAT_AND_HALF_MOE(
batch_tokens.scalar_type(), "moe dispatch forward",
moe_dpch_fwd_launch<scalar_t>(
batch_tokens.data_ptr<scalar_t>(), res.data_ptr<scalar_t>(),
@@ -586,7 +585,7 @@ torch::Tensor moe_dispatch_cuda_backward(int s, int ec, int h,
{s, h}, torch::dtype(expert_grad.dtype()).device(expert_grad.device()));
auto k = mask.size(0);
DISPATCH_FLOAT_AND_HALF(
DISPATCH_FLOAT_AND_HALF_MOE(
expert_grad.scalar_type(), "moe dispatch backward",
moe_dpch_bwd_launch<scalar_t>(
res.data_ptr<scalar_t>(), expert_grad.data_ptr<scalar_t>(),
@@ -609,7 +608,7 @@ torch::Tensor moe_combine_cuda_forward(int s, int e, int c, int h,
torch::dtype(expert_tokens.dtype()).device(expert_tokens.device()));
auto k = mask.size(0);
DISPATCH_FLOAT_AND_HALF(
DISPATCH_FLOAT_AND_HALF_MOE(
expert_tokens.scalar_type(), "moe combine forward",
moe_cb_fwd_launch<scalar_t>(
expert_tokens.data_ptr<scalar_t>(), res.data_ptr<scalar_t>(),
@@ -636,7 +635,7 @@ std::vector<torch::Tensor> moe_combine_cuda_backward(
{s, e}, torch::dtype(logits.dtype()).device(logits.device()));
auto k = mask.size(0);
DISPATCH_FLOAT_AND_HALF(
DISPATCH_FLOAT_AND_HALF_MOE(
tokens_grad.scalar_type(), "moe combine backward",
moe_cb_bwd_launch<scalar_t>(
tokens_grad.data_ptr<scalar_t>(), egrad.data_ptr<scalar_t>(),
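The rename matters because an identically named DISPATCH_FLOAT_AND_HALF already lives in common/micros.h, which this file now reaches transitively through funcs/reduce_function.h ("delete duplicated micros" in the commit message). The macro's body, reconstructed from the truncated hunk (a sketch; the Half branch is assumed symmetric to the Float one shown above):

```cpp
#include <torch/extension.h>

#define DISPATCH_FLOAT_AND_HALF_MOE(TYPE, NAME, ...)                  \
  switch (TYPE) {                                                     \
    case at::ScalarType::Float: {                                     \
      using scalar_t = float;                                         \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    case at::ScalarType::Half: {                                      \
      using scalar_t = at::Half;                                      \
      __VA_ARGS__;                                                    \
      break;                                                          \
    }                                                                 \
    default:                                                          \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
```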

2
extensions/csrc/cuda/multi_tensor_adam_kernel.cu → extensions/csrc/kernel/cuda/multi_tensor_adam_kernel.cu

@@ -15,7 +15,7 @@
#include <assert.h>
#include "multi_tensor_apply.cuh"
#include "../common/micros.h"
#include "common/micros.h"
#define BLOCK_SIZE 512
#define ILP 4

2
extensions/csrc/cuda/multi_tensor_apply.cuh → extensions/csrc/kernel/cuda/multi_tensor_apply.cuh

@@ -12,7 +12,7 @@
#include <assert.h>
#include <c10/cuda/CUDAGuard.h>
#include "../common/micros.h"
#include "common/micros.h"
// #include <iostream>

3
extensions/csrc/cuda/multi_tensor_l2norm_kernel.cu → extensions/csrc/kernel/cuda/multi_tensor_l2norm_kernel.cu

@@ -11,8 +11,7 @@
#include <assert.h>
#include "multi_tensor_apply.cuh"
#include "../common/micros.h"
#include "funcs/reduce_function.h"
#include "common/micros.h"
#define BLOCK_SIZE 512
#define ILP 4

2
extensions/csrc/cuda/multi_tensor_lamb_kernel.cu → extensions/csrc/kernel/cuda/multi_tensor_lamb_kernel.cu

@@ -10,7 +10,7 @@
#include <assert.h>
#include "multi_tensor_apply.cuh"
#include "../common/micros.h"
#include "common/micros.h"
#define BLOCK_SIZE 512
#define ILP 4

2
extensions/csrc/cuda/multi_tensor_scale_kernel.cu → extensions/csrc/kernel/cuda/multi_tensor_scale_kernel.cu

@@ -10,7 +10,7 @@
#include <sstream>
#include "multi_tensor_apply.cuh"
#include "../common/micros.h"
#include "common/micros.h"
#define BLOCK_SIZE 512
#define ILP 4

2
extensions/csrc/cuda/multi_tensor_sgd_kernel.cu → extensions/csrc/kernel/cuda/multi_tensor_sgd_kernel.cu

@@ -7,7 +7,7 @@
#include <assert.h>
#include <cuda_runtime.h>
#include "../common/micros.h"
#include "common/micros.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512

18
extensions/csrc/cuda/rms_layernorm_kernel.cu → extensions/csrc/kernel/cuda/rms_layernorm_kernel.cu

@@ -7,18 +7,18 @@
#include <c10/cuda/CUDAGuard.h>
#include "../common/micros.h"
#include "common/micros.h"
#include "funcs/cast_functor.h"
#include "funcs/binary_functor.h"
#include "funcs/reduce_function.h"
#include "utils/vec_type_traits.h"
using colossalAI::cuda::funcs::block_reduce;
using colossalAI::cuda::funcs::ReduceType;
using colossalAI::cuda::funcs::CastFunctor;
using colossalAI::cuda::funcs::BinaryOpFunctor;
using colossalAI::cuda::funcs::BinaryOpType;
using colossalAI::cuda::utils::VecTypeTrait;
#include "common/vec_type_traits.h"
using colossalAI::funcs::block_reduce;
using colossalAI::funcs::ReduceType;
using colossalAI::funcs::CastFunctor;
using colossalAI::funcs::BinaryOpFunctor;
using colossalAI::funcs::BinaryOpType;
using colossalAI::common::VecTypeTrait;
#define RMSNORM_LAUNCHER(UNROLL_FACTOR, THREADDIM) \
DISPATCH_RMSNORM_FLOAT_HALF_AND_BFLOAT( \

10
extensions/csrc/cuda/scaled_masked_softmax_kernel.cu → extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu

@@ -14,15 +14,15 @@
#include <cfloat>
#include <limits>
#include "../common/micros.h"
#include "common/micros.h"
#include "utils/vec_copy.h"
#include "funcs/reduce_function.h"
#include "funcs/unary_functor.h"
using colossalAI::cuda::funcs::UnaryOpFunctor;
using colossalAI::cuda::funcs::UnaryOpType;
using colossalAI::cuda::funcs::warp_reduce;
using colossalAI::cuda::funcs::ReduceType;
using colossalAI::funcs::UnaryOpFunctor;
using colossalAI::funcs::UnaryOpType;
using colossalAI::funcs::warp_reduce;
using colossalAI::funcs::ReduceType;
using colossalAI::cuda::utils::copy_vector;

10
extensions/csrc/cuda/scaled_upper_triang_masked_softmax_kernel.cu → extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu

@@ -14,15 +14,15 @@
#include <cfloat>
#include <limits>
#include "../common/micros.h"
#include "common/micros.h"
#include "utils/vec_copy.h"
#include "funcs/reduce_function.h"
#include "funcs/unary_functor.h"
using colossalAI::cuda::funcs::UnaryOpFunctor;
using colossalAI::cuda::funcs::UnaryOpType;
using colossalAI::cuda::funcs::warp_reduce;
using colossalAI::cuda::funcs::ReduceType;
using colossalAI::funcs::UnaryOpFunctor;
using colossalAI::funcs::UnaryOpType;
using colossalAI::funcs::warp_reduce;
using colossalAI::funcs::ReduceType;
using colossalAI::cuda::utils::copy_vector;
using colossalAI::cuda::utils::copy_zero_vector;

0
extensions/csrc/cuda/utils/gpu_launch_config.h → extensions/csrc/kernel/cuda/utils/gpu_launch_config.h

0
extensions/csrc/cuda/utils/micros.h → extensions/csrc/kernel/cuda/utils/micros.h

0
extensions/csrc/cuda/utils/nvgpu_dev_info.h → extensions/csrc/kernel/cuda/utils/nvgpu_dev_info.h

11
extensions/csrc/cuda/utils/vec_copy.h → extensions/csrc/kernel/cuda/utils/vec_copy.h

@@ -4,8 +4,8 @@
#include <cuda_fp16.h>
#include <stdint.h>
#include "../funcs/cast_functor.h"
#include "vec_type_traits.h"
#include "common/vec_type_traits.h"
#include "funcs/cast_functor.h"
namespace colossalAI {
namespace cuda {
@@ -13,7 +13,7 @@ namespace utils {
template <typename T, int VecSize>
__device__ __inline__ void copy_vector(T *dst, const T *src) {
using VT = typename colossalAI::cuda::utils::VecTypeTrait<T, VecSize>::Type;
using VT = typename common::VecTypeTrait<T, VecSize>::Type;
// Note(LiuYang): Here static_cast can't be used for cast between two pointer
*(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
}
@@ -29,9 +29,8 @@ __device__ __inline__ void copy_vector<float, 8>(float *dst, const float *src) {
template <typename T, int VecSize>
__device__ __inline__ void copy_zero_vector(T *dst) {
using VT = typename colossalAI::cuda::utils::VecTypeTrait<T, VecSize>::Type;
*(reinterpret_cast<VT *>(dst)) =
colossalAI::cuda::funcs::CastFunctor<float, VT>()(0.0f);
using VT = typename common::VecTypeTrait<T, VecSize>::Type;
*(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
}
template <typename T>
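A sketch of how kernels typically drive copy_vector for coalesced element-wise copies, with the csrc root on the include path as set up by include_dirs (the kernel name and launch shape are illustrative, not from the commit):

```cpp
#if defined(COLOSSAL_WITH_CUDA)
#include "kernel/cuda/utils/vec_copy.h"

template <typename T, int VecSize>
__global__ void vec_memcpy_kernel(T* dst, const T* src, int64_t numel) {
  // Each thread moves VecSize contiguous elements in one wide transaction.
  int64_t idx =
      (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
  if (idx + VecSize <= numel) {
    colossalAI::cuda::utils::copy_vector<T, VecSize>(dst + idx, src + idx);
  }
}
#endif
```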

0
extensions/csrc/x86/cpu_adam.cpp → extensions/csrc/kernel/x86/cpu_adam.cpp

0
extensions/csrc/x86/cpu_adam.h → extensions/csrc/kernel/x86/cpu_adam.h

7
extensions/cuda_extension.py

@@ -21,6 +21,7 @@ class _CudaExtension(_CppExtension):
"""
This function should return a list of nvcc compilation flags for extensions.
"""
return ["-DCOLOSSAL_WITH_CUDA"]
def is_available(self) -> bool:
# cuda extension can only be built if cuda is available
@@ -53,6 +54,12 @@ class _CudaExtension(_CppExtension):
cuda_include = os.path.join(CUDA_HOME, "include")
return cuda_include
def include_dirs(self) -> List[str]:
"""
This function should return a list of include files for extensions.
"""
return super().include_dirs() + [self.get_cuda_home_include()]
def build_jit(self) -> None:
from torch.utils.cpp_extension import CUDA_HOME, load

36
extensions/inference/inference_ops_cuda.py

@@ -1,36 +0,0 @@
from ..cuda_extension import _CudaExtension
from ..utils import get_cuda_cc_flag
class InferenceOpsCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name="inference_ops_cuda")
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in [
"cuda/pybind/inference.cpp",
"cuda/decode_kv_cache_memcpy_kernel.cu",
"cuda/context_kv_cache_memcpy_kernel.cu",
"cuda/fused_rotary_emb_and_cache_kernel.cu",
"cuda/activation_kernel.cu",
"cuda/rms_layernorm_kernel.cu",
"cuda/get_cos_and_sin_kernel.cu",
"cuda/flash_decoding_attention_kernel.cu",
]
]
return ret
def include_dirs(self):
ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()]
return ret
def cxx_flags(self):
version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
return ["-O3"] + version_dependent_macros
def nvcc_flags(self):
extra_cuda_flags = ["-lineinfo"]
extra_cuda_flags.extend(get_cuda_cc_flag())
return ["-O3", "--use_fast_math"] + extra_cuda_flags

0
extensions/pybind/__init__.py

0
extensions/cpu_adam/__init__.py → extensions/pybind/cpu_adam/__init__.py

9
extensions/cpu_adam/cpu_adam_arm.py → extensions/pybind/cpu_adam/cpu_adam_arm.py

@@ -1,6 +1,7 @@
import platform
from typing import List
from ..cpp_extension import _CppExtension
from ...cpp_extension import _CppExtension
class CpuAdamArmExtension(_CppExtension):
@@ -20,12 +21,12 @@ class CpuAdamArmExtension(_CppExtension):
# necessary 4 functions
def sources_files(self):
ret = [
self.csrc_abs_path("arm/cpu_adam_arm.cpp"),
self.csrc_abs_path("kernel/arm/cpu_adam_arm.cpp"),
]
return ret
def include_dirs(self):
return []
def include_dirs(self) -> List[str]:
return super().include_dirs()
def cxx_flags(self):
extra_cxx_flags = [

11
extensions/cpu_adam/cpu_adam_x86.py → extensions/pybind/cpu_adam/cpu_adam_x86.py

@@ -1,7 +1,7 @@
import platform
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads
from ...cuda_extension import _CudaExtension
from ...utils import append_nvcc_threads
class CpuAdamX86Extension(_CudaExtension):
@@ -21,13 +21,10 @@ class CpuAdamX86Extension(_CudaExtension):
# necessary 4 functions
def sources_files(self):
ret = [
self.csrc_abs_path("x86/cpu_adam.cpp"),
self.csrc_abs_path("kernel/x86/cpu_adam.cpp"),
]
return ret
def include_dirs(self):
return [self.csrc_abs_path("includes"), self.get_cuda_home_include()]
def cxx_flags(self):
extra_cxx_flags = [
"-std=c++14",
@@ -50,5 +47,5 @@ class CpuAdamX86Extension(_CudaExtension):
"-U__CUDA_NO_HALF2_OPERATORS__",
"-DTHRUST_IGNORE_CUB_VERSION_CHECK",
]
ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + super().nvcc_flags()
return append_nvcc_threads(ret)

0
extensions/flash_attention/__init__.py → extensions/pybind/flash_attention/__init__.py

2
extensions/flash_attention/flash_attention_dao_cuda.py → extensions/pybind/flash_attention/flash_attention_dao_cuda.py

@@ -1,4 +1,4 @@
from ..base_extension import _Extension
from ...base_extension import _Extension
class FlashAttentionDaoCudaExtension(_Extension):

2
extensions/flash_attention/flash_attention_npu.py → extensions/pybind/flash_attention/flash_attention_npu.py

@@ -1,4 +1,4 @@
from ..base_extension import _Extension
from ...base_extension import _Extension
class FlashAttentionNpuExtension(_Extension):

2
extensions/flash_attention/flash_attention_sdpa_cuda.py → extensions/pybind/flash_attention/flash_attention_sdpa_cuda.py

@@ -1,4 +1,4 @@
from ..base_extension import _Extension
from ...base_extension import _Extension
class FlashAttentionSdpaCudaExtension(_Extension):

0
extensions/inference/__init__.py → extensions/pybind/inference/__init__.py

0
extensions/csrc/cuda/pybind/inference.cpp → extensions/pybind/inference/inference.cpp

31
extensions/pybind/inference/inference_ops_cuda.py

@@ -0,0 +1,31 @@
from ...cuda_extension import _CudaExtension
from ...utils import get_cuda_cc_flag
class InferenceOpsCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name="inference_ops_cuda")
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in [
"kernel/cuda/decode_kv_cache_memcpy_kernel.cu",
"kernel/cuda/context_kv_cache_memcpy_kernel.cu",
"kernel/cuda/fused_rotary_emb_and_cache_kernel.cu",
"kernel/cuda/activation_kernel.cu",
"kernel/cuda/rms_layernorm_kernel.cu",
"kernel/cuda/get_cos_and_sin_kernel.cu",
"kernel/cuda/flash_decoding_attention_kernel.cu",
]
] + [self.pybind_abs_path("inference/inference.cpp")]
return ret
def cxx_flags(self):
version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]
return ["-O3"] + version_dependent_macros
def nvcc_flags(self):
extra_cuda_flags = ["-lineinfo"]
extra_cuda_flags.extend(get_cuda_cc_flag())
return ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags()

0
extensions/layernorm/__init__.py → extensions/pybind/layernorm/__init__.py

2
extensions/csrc/cuda/pybind/layer_norm.cpp → extensions/pybind/layernorm/layer_norm.cpp

@@ -7,7 +7,7 @@
#include <cassert>
#include <vector>
#include "../../common/micros.h"
#include "common/micros.h"
namespace {

12
extensions/layernorm/layernorm_cuda.py → extensions/pybind/layernorm/layernorm_cuda.py

@@ -1,5 +1,5 @@
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
from ...cuda_extension import _CudaExtension
from ...utils import append_nvcc_threads, get_cuda_cc_flag
class LayerNormCudaExtension(_CudaExtension):
@@ -7,11 +7,13 @@ class LayerNormCudaExtension(_CudaExtension):
super().__init__(name="layernorm_cuda")
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["cuda/pybind/layer_norm.cpp", "cuda/layer_norm_kernel.cu"]]
ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/layer_norm_kernel.cu"]] + [
self.pybind_abs_path("layernorm/layer_norm.cpp")
]
return ret
def include_dirs(self):
ret = [self.get_cuda_home_include()]
ret = [self.get_cuda_home_include()] + [self.csrc_abs_path("")]
return ret
def cxx_flags(self):
@@ -20,5 +22,5 @@ class LayerNormCudaExtension(_CudaExtension):
def nvcc_flags(self):
extra_cuda_flags = ["-maxrregcount=50"]
extra_cuda_flags.extend(get_cuda_cc_flag())
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + self.version_dependent_macros
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + self.version_dependent_macros + super().nvcc_flags()
return append_nvcc_threads(ret)

0
extensions/moe/__init__.py → extensions/pybind/moe/__init__.py

0
extensions/csrc/cuda/pybind/moe.cpp → extensions/pybind/moe/moe.cpp

14
extensions/moe/moe_cuda.py → extensions/pybind/moe/moe_cuda.py

@@ -1,17 +1,15 @@
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
from ...cuda_extension import _CudaExtension
from ...utils import append_nvcc_threads, get_cuda_cc_flag
class MoeCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name="moe_cuda")
def include_dirs(self):
ret = [self.csrc_abs_path("cuda/include"), self.get_cuda_home_include()]
return ret
def sources_files(self):
ret = [self.csrc_abs_path(fname) for fname in ["cuda/pybind/moe.cpp", "cuda/moe_kernel.cu"]]
ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/moe_kernel.cu"]] + [
self.pybind_abs_path("moe/moe.cpp")
]
return ret
def cxx_flags(self):
@@ -25,5 +23,5 @@ class MoeCudaExtension(_CudaExtension):
"--expt-extended-lambda",
]
extra_cuda_flags.extend(get_cuda_cc_flag())
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags()
return append_nvcc_threads(ret)

0
extensions/optimizer/__init__.py → extensions/pybind/optimizer/__init__.py

23
extensions/optimizer/fused_optimizer_cuda.py → extensions/pybind/optimizer/fused_optimizer_cuda.py

@@ -1,5 +1,5 @@
from ..cuda_extension import _CudaExtension
from ..utils import get_cuda_cc_flag
from ...cuda_extension import _CudaExtension
from ...utils import get_cuda_cc_flag
class FusedOptimizerCudaExtension(_CudaExtension):
@@ -10,18 +10,13 @@ class FusedOptimizerCudaExtension(_CudaExtension):
ret = [
self.csrc_abs_path(fname)
for fname in [
"cuda/pybind/optimizer.cpp",
"cuda/multi_tensor_sgd_kernel.cu",
"cuda/multi_tensor_scale_kernel.cu",
"cuda/multi_tensor_adam_kernel.cu",
"cuda/multi_tensor_l2norm_kernel.cu",
"cuda/multi_tensor_lamb_kernel.cu",
"kernel/cuda/multi_tensor_sgd_kernel.cu",
"kernel/cuda/multi_tensor_scale_kernel.cu",
"kernel/cuda/multi_tensor_adam_kernel.cu",
"kernel/cuda/multi_tensor_l2norm_kernel.cu",
"kernel/cuda/multi_tensor_lamb_kernel.cu",
]
]
return ret
def include_dirs(self):
ret = [self.get_cuda_home_include()]
] + [self.pybind_abs_path("optimizer/optimizer.cpp")]
return ret
def cxx_flags(self):
@@ -31,4 +26,4 @@ class FusedOptimizerCudaExtension(_CudaExtension):
def nvcc_flags(self):
extra_cuda_flags = ["-lineinfo"]
extra_cuda_flags.extend(get_cuda_cc_flag())
return ["-O3", "--use_fast_math"] + extra_cuda_flags
return ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags()

0
extensions/csrc/cuda/pybind/optimizer.cpp → extensions/pybind/optimizer/optimizer.cpp

0
extensions/softmax/__init__.py → extensions/pybind/softmax/__init__.py

0
extensions/csrc/cuda/pybind/scaled_masked_softmax.cpp → extensions/pybind/softmax/scaled_masked_softmax.cpp

14
extensions/softmax/scaled_masked_softmax_cuda.py → extensions/pybind/softmax/scaled_masked_softmax_cuda.py

@@ -1,5 +1,5 @@
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads
from ...cuda_extension import _CudaExtension
from ...utils import append_nvcc_threads
class ScaledMaskedSoftmaxCudaExtension(_CudaExtension):
@@ -7,15 +7,11 @@ class ScaledMaskedSoftmaxCudaExtension(_CudaExtension):
super().__init__(name="scaled_masked_softmax_cuda")
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in ["cuda/pybind/scaled_masked_softmax.cpp", "cuda/scaled_masked_softmax_kernel.cu"]
ret = [self.csrc_abs_path(fname) for fname in ["kernel/cuda/scaled_masked_softmax_kernel.cu"]] + [
self.pybind_abs_path("softmax/scaled_masked_softmax.cpp")
]
return ret
def include_dirs(self):
return [self.get_cuda_home_include()]
def cxx_flags(self):
return ["-O3"] + self.version_dependent_macros
@@ -28,5 +24,5 @@ class ScaledMaskedSoftmaxCudaExtension(_CudaExtension):
"-U__CUDA_NO_HALF2_OPERATORS__",
"-DTHRUST_IGNORE_CUB_VERSION_CHECK",
]
ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags
ret = ["-O3", "--use_fast_math"] + self.version_dependent_macros + extra_cuda_flags + super().nvcc_flags()
return append_nvcc_threads(ret)

0
extensions/csrc/cuda/pybind/scaled_upper_triang_masked_softmax.cpp → extensions/pybind/softmax/scaled_upper_triang_masked_softmax.cpp

14
extensions/softmax/scaled_upper_triangle_masked_softmax_cuda.py → extensions/pybind/softmax/scaled_upper_triangle_masked_softmax_cuda.py

@@ -1,22 +1,18 @@
from ..cuda_extension import _CudaExtension
from ..utils import append_nvcc_threads, get_cuda_cc_flag
from ...cuda_extension import _CudaExtension
from ...utils import append_nvcc_threads, get_cuda_cc_flag
class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension):
def __init__(self):
super().__init__(name="scaled_upper_triangle_masked_softmax_cuda")
def include_dirs(self):
return [self.get_cuda_home_include()]
def sources_files(self):
ret = [
self.csrc_abs_path(fname)
for fname in [
"cuda/pybind/scaled_upper_triang_masked_softmax.cpp",
"cuda/scaled_upper_triang_masked_softmax_kernel.cu",
"kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu",
]
]
] + [self.pybind_abs_path("softmax/scaled_upper_triang_masked_softmax.cpp")]
return ret
def cxx_flags(self):
@@ -30,5 +26,5 @@ class ScaledUpperTriangleMaskedSoftmaxCudaExtension(_CudaExtension):
"--expt-extended-lambda",
]
extra_cuda_flags.extend(get_cuda_cc_flag())
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags
ret = ["-O3", "--use_fast_math"] + extra_cuda_flags + super().nvcc_flags()
return append_nvcc_threads(ret)