diff --git a/colossalai/kernel/__init__.py b/colossalai/kernel/__init__.py
index 113ec79da..1e48019c9 100644
--- a/colossalai/kernel/__init__.py
+++ b/colossalai/kernel/__init__.py
@@ -12,4 +12,12 @@ except ImportError:
     from colossalai.kernel.op_builder import CPUAdamBuilder
     cpu_optim = CPUAdamBuilder().load()
 
-__all__ = ["fused_optim", "cpu_optim", "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"]
+try:
+    from colossalai._C import multihead_attention
+except ImportError:
+    from colossalai.kernel.op_builder import MultiHeadAttnBuilder
+    multihead_attention = MultiHeadAttnBuilder().load()
+
+__all__ = [
+    "fused_optim", "cpu_optim", "multihead_attention", "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention"
+]
diff --git a/colossalai/kernel/cuda_native/multihead_attention.py b/colossalai/kernel/cuda_native/multihead_attention.py
index 84cae529a..2c7503453 100644
--- a/colossalai/kernel/cuda_native/multihead_attention.py
+++ b/colossalai/kernel/cuda_native/multihead_attention.py
@@ -135,11 +135,8 @@ class MultiHeadAttention(nn.Module):
         # Load cuda modules if needed
         global colossal_multihead_attention
         if colossal_multihead_attention is None:
-            try:
-                import colossalai._C.multihead_attention
-                colossal_multihead_attention = colossalai._C.multihead_attention
-            except ImportError:
-                raise RuntimeError('MultiHeadAttention requires cuda extensions')
+            from colossalai.kernel import multihead_attention
+            colossal_multihead_attention = multihead_attention
 
         # create the layer in cuda kernels.
         cuda_module = colossal_multihead_attention
diff --git a/colossalai/kernel/op_builder/__init__.py b/colossalai/kernel/op_builder/__init__.py
index 6cc3e6358..654f595a0 100644
--- a/colossalai/kernel/op_builder/__init__.py
+++ b/colossalai/kernel/op_builder/__init__.py
@@ -1,4 +1,5 @@
 from .cpu_adam import CPUAdamBuilder
 from .fused_optim import FusedOptimBuilder
+from .multi_head_attn import MultiHeadAttnBuilder
 
-__all__ = ['CPUAdamBuilder', 'FusedOptimBuilder']
+__all__ = ['CPUAdamBuilder', 'FusedOptimBuilder', 'MultiHeadAttnBuilder']
diff --git a/colossalai/kernel/op_builder/builder.py b/colossalai/kernel/op_builder/builder.py
index 36f27d348..bb8996217 100644
--- a/colossalai/kernel/op_builder/builder.py
+++ b/colossalai/kernel/op_builder/builder.py
@@ -1,7 +1,26 @@
 import os
+import re
 import sys
 from pathlib import Path
 
+import torch
+
+
+def get_cuda_cc_flag():
+    """get_cuda_cc_flag
+
+    cc flag for your GPU arch
+    """
+    cc_flag = []
+    for arch in torch.cuda.get_arch_list():
+        res = re.search(r'sm_(\d+)', arch)
+        if res:
+            arch_cap = res[1]
+            if int(arch_cap) >= 60:
+                cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])
+
+    return cc_flag
+
 
 class Builder(object):
diff --git a/colossalai/kernel/op_builder/fused_optim.py b/colossalai/kernel/op_builder/fused_optim.py
index cbf76be82..fc97caaa0 100644
--- a/colossalai/kernel/op_builder/fused_optim.py
+++ b/colossalai/kernel/op_builder/fused_optim.py
@@ -3,7 +3,7 @@ import re
 
 import torch
 
-from .builder import Builder
+from .builder import Builder, get_cuda_cc_flag
 
 
 class FusedOptimBuilder(Builder):
@@ -16,12 +16,7 @@ class FusedOptimBuilder(Builder):
         self.extra_cxx_flags = []
         self.extra_cuda_flags = ['-lineinfo']
 
-        for arch in torch.cuda.get_arch_list():
-            res = re.search(r'sm_(\d+)', arch)
-            if res:
-                arch_cap = res[1]
-                if int(arch_cap) >= 60:
-                    self.extra_cuda_flags.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])
+        self.extra_cuda_flags.extend(get_cuda_cc_flag())
 
         self.sources = [self.colossalai_src_path(path) for path in self.sources_files()]
         self.extra_include_paths = [self.colossalai_src_path(path) for path in self.include_paths()]
diff --git a/colossalai/kernel/op_builder/multi_head_attn.py b/colossalai/kernel/op_builder/multi_head_attn.py
new file mode 100644
index 000000000..43a5dc6be
--- /dev/null
+++ b/colossalai/kernel/op_builder/multi_head_attn.py
@@ -0,0 +1,51 @@
+import os
+
+from .builder import Builder, get_cuda_cc_flag
+
+
+class MultiHeadAttnBuilder(Builder):
+
+    def __init__(self):
+        self.base_dir = "cuda_native/csrc"
+        self.name = 'multihead_attention'
+        super().__init__()
+        self.extra_cxx_flags = []
+        self.extra_cuda_flags = [
+            '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__',
+            '-U__CUDA_NO_HALF2_OPERATORS__', '-DTHRUST_IGNORE_CUB_VERSION_CHECK'
+        ]
+
+        self.extra_cuda_flags.extend(get_cuda_cc_flag())
+        self.sources = [self.colossalai_src_path(path) for path in self.sources_files()]
+        self.extra_include_paths = [self.colossalai_src_path(path) for path in self.include_paths()]
+
+        self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
+
+    def sources_files(self):
+        return [
+            os.path.join(self.base_dir, fname) for fname in [
+                'multihead_attention_1d.cpp', 'kernels/cublas_wrappers.cu', 'kernels/transform_kernels.cu',
+                'kernels/dropout_kernels.cu', 'kernels/normalize_kernels.cu', 'kernels/softmax_kernels.cu',
+                'kernels/general_kernels.cu', 'kernels/cuda_util.cu'
+            ]
+        ]
+
+    def include_paths(self):
+        from torch.utils.cpp_extension import CUDA_HOME
+        ret = []
+        cuda_include = os.path.join(CUDA_HOME, "include")
+        ret = [os.path.join(self.base_dir, "includes"), cuda_include]
+        ret.append(os.path.join(self.base_dir, "kernels", "include"))
+        print("include_paths", ret)
+        return ret
+
+    def builder(self, name):
+        from torch.utils.cpp_extension import CUDAExtension
+        return CUDAExtension(
+            name=name,
+            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in self.sources],
+            include_dirs=self.extra_include_paths,
+            extra_compile_args={
+                'cxx': ['-O3'] + self.version_dependent_macros,
+                'nvcc': ['-O3', '--use_fast_math'] + self.extra_cuda_flags
+            })
diff --git a/setup.py b/setup.py
index 57a2a046f..ba6f5a7d4 100644
--- a/setup.py
+++ b/setup.py
@@ -172,17 +172,9 @@ if build_cuda_ext:
         cuda_ext_helper('colossalai._C.layer_norm', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
                         extra_cuda_flags + cc_flag))
 
-    extra_cuda_flags = [
-        '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
-        '-DTHRUST_IGNORE_CUB_VERSION_CHECK'
-    ]
-
-    ext_modules.append(
-        cuda_ext_helper('colossalai._C.multihead_attention', [
-            'multihead_attention_1d.cpp', 'kernels/cublas_wrappers.cu', 'kernels/transform_kernels.cu',
-            'kernels/dropout_kernels.cu', 'kernels/normalize_kernels.cu', 'kernels/softmax_kernels.cu',
-            'kernels/general_kernels.cu', 'kernels/cuda_util.cu'
-        ], extra_cuda_flags + cc_flag))
+    ### MultiHeadAttn Kernel ####
+    from colossalai.kernel.op_builder import MultiHeadAttnBuilder
+    ext_modules.append(MultiHeadAttnBuilder().builder('colossalai._C.multihead_attention'))
 
     ### Gemini Adam kernel ####
     from colossalai.kernel.op_builder import CPUAdamBuilder
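
For reference, a small sketch of what the shared get_cuda_cc_flag() helper introduced in op_builder/builder.py produces. The parameterized cc_flags_for() below is hypothetical (the real helper reads torch.cuda.get_arch_list() directly), and the arch list is only an example; actual output depends on the local PyTorch/CUDA build.

import re

def cc_flags_for(arch_list):
    # Hypothetical stand-in for get_cuda_cc_flag(), parameterized for illustration:
    # emit '-gencode arch=compute_XX,code=sm_XX' for every architecture >= sm_60.
    cc_flag = []
    for arch in arch_list:
        res = re.search(r'sm_(\d+)', arch)
        if res and int(res[1]) >= 60:
            cc_flag.extend(['-gencode', f'arch=compute_{res[1]},code={arch}'])
    return cc_flag

# Example: a PyTorch build targeting sm_52, sm_70 and sm_80 keeps only the >= sm_60 archs.
print(cc_flags_for(['sm_52', 'sm_70', 'sm_80']))
# ['-gencode', 'arch=compute_70,code=sm_70', '-gencode', 'arch=compute_80,code=sm_80']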