mirror of https://github.com/hpcaitech/ColossalAI
115 lines
4.3 KiB
Python
115 lines
4.3 KiB
Python
import os
|
|
import pathlib
|
|
import subprocess
|
|
|
|
from torch.utils import cpp_extension
|
|
|
|
# Setting this param to a list has a problem of generating different
|
|
# compilation commands (with different order of architectures) and
|
|
# leading to recompilation of fused kernels. Set it to empty string
|
|
# to avoid recompilation and assign arch flags explicitly in
|
|
# extra_cuda_cflags below
|
|
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
|
|
|
|
|
|
def _build_cuda_native_kernel():
    """JIT-compile and load the fused CUDA extensions via ``cpp_extension.load``.

    Four extensions are built, in this order: the upper-triangular masked
    softmax, the masked softmax, the mixed precision fused layer norm and the
    mixed precision multi-head attention. Sources are read from the ``csrc``
    directory next to this file and build artifacts go to a sibling ``build``
    directory. The function returns ``None``; the loaded modules are not
    handed back to the caller.
    """
    # Check if cuda 11 is installed for compute capability 8.0
    _, cuda_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    sm80_flags = ['-gencode', 'arch=compute_80,code=sm_80'] if int(cuda_major) >= 11 else []

    # Source and build locations, all relative to this file.
    root = pathlib.Path(__file__).parent.absolute()
    csrc = root / 'csrc'
    build_dir = root / 'build'
    _create_build_dir(build_dir)

    # nvcc flags shared by every extension; per-extension flags are appended
    # after these and the optional sm_80 flags come last.
    common_nvcc_flags = ['-O3', '-gencode', 'arch=compute_70,code=sm_70', '--use_fast_math']

    def _load_extension(name, sources, extra_cuda_flags):
        """Build (if needed) and load one extension with the shared settings."""
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=build_dir,
            extra_cflags=['-O3'],
            extra_include_paths=[str(csrc / 'kernels' / 'include')],
            extra_cuda_cflags=common_nvcc_flags + extra_cuda_flags + sm80_flags,
            verbose=False,
        )

    # Fused softmax (both variants share the same nvcc flags).
    softmax_flags = [
        '-U__CUDA_NO_HALF_OPERATORS__',
        '-U__CUDA_NO_HALF_CONVERSIONS__',
        '--expt-relaxed-constexpr',
        '--expt-extended-lambda',
    ]

    # Mixed precision fused layer norm.
    layer_norm_flags = ['-maxrregcount=50']

    # Mixed precision Transformer Encoder Layer.
    attention_flags = [
        '-std=c++14',
        '-U__CUDA_NO_HALF_OPERATORS__',
        '-U__CUDA_NO_HALF_CONVERSIONS__',
        '-U__CUDA_NO_HALF2_OPERATORS__',
        '-DTHRUST_IGNORE_CUB_VERSION_CHECK',
    ]
    attention_kernel_files = (
        "cublas_wrappers.cu",
        "transform_kernels.cu",
        "dropout_kernels.cu",
        "normalize_kernels.cu",
        "softmax_kernels.cu",
        "general_kernels.cu",
        "cuda_util.cu",
    )
    attention_sources = [csrc / 'multihead_attention_1d.cpp']
    attention_sources.extend(csrc / 'kernels' / cu_file for cu_file in attention_kernel_files)

    # (extension name, source files, extension-specific nvcc flags), in build order.
    build_plan = (
        (
            "colossal_scaled_upper_triang_masked_softmax",
            [
                csrc / 'scaled_upper_triang_masked_softmax.cpp',
                csrc / 'scaled_upper_triang_masked_softmax_cuda.cu',
            ],
            softmax_flags,
        ),
        (
            "colossal_scaled_masked_softmax",
            [
                csrc / 'scaled_masked_softmax.cpp',
                csrc / 'scaled_masked_softmax_cuda.cu',
            ],
            softmax_flags,
        ),
        (
            "colossal_layer_norm_cuda",
            [
                csrc / 'layer_norm_cuda.cpp',
                csrc / 'layer_norm_cuda_kernel.cu',
            ],
            layer_norm_flags,
        ),
        (
            "colossal_multihead_attention",
            attention_sources,
            attention_flags,
        ),
    )
    for ext_name, ext_sources, ext_flags in build_plan:
        _load_extension(ext_name, ext_sources, ext_flags)
|
|
|
|
|
|
def _get_cuda_bare_metal_version(cuda_dir):
    """Run ``nvcc -V`` from *cuda_dir* and extract the CUDA release numbers.

    Args:
        cuda_dir: Root of the CUDA installation (``str`` or ``os.PathLike``);
            the compiler is looked up at ``<cuda_dir>/bin/nvcc``.

    Returns:
        A ``(raw_output, major, minor)`` tuple: the complete text printed by
        ``nvcc -V`` followed by the major and minor release numbers, both as
        strings.

    Raises:
        OSError: If the ``nvcc`` executable cannot be started.
        subprocess.CalledProcessError: If ``nvcc`` exits with a non-zero status.
        ValueError: If the output does not contain the word ``release``.
        IndexError: If the token after ``release`` has no ``major.minor`` form.
    """
    # os.path.join accepts both str and path-like roots.
    nvcc = os.path.join(cuda_dir, "bin", "nvcc")
    raw_output = subprocess.check_output([nvcc, "-V"], universal_newlines=True)

    tokens = raw_output.split()
    # The version is the whitespace-separated token right after "release";
    # nvcc typically prints it with a trailing comma (e.g. "release 11.3,").
    release = tokens[tokens.index("release") + 1].split(".")
    bare_metal_major = release[0]
    # Strip the trailing comma instead of keeping only the first character so
    # that multi-digit minor versions are not truncated.
    bare_metal_minor = release[1].rstrip(",")

    return raw_output, bare_metal_major, bare_metal_minor
|
|
|
|
|
|
def _create_build_dir(buildpath):
    """Make sure the build directory exists, creating missing parents too.

    This is best-effort: when the directory cannot be created, a message is
    printed and the function returns normally instead of raising.

    Args:
        buildpath: Directory to create (``str`` or ``os.PathLike``).
    """
    try:
        # exist_ok=True makes repeated calls harmless; makedirs also creates
        # any missing intermediate directories.
        os.makedirs(buildpath, exist_ok=True)
    except OSError:
        # Only report the failure when the directory is really absent.
        if not os.path.isdir(buildpath):
            print(f"Creation of the build directory {buildpath} failed")
|