diff --git a/setup.py b/setup.py
index c3913302f..748361154 100644
--- a/setup.py
+++ b/setup.py
@@ -138,11 +138,7 @@ if build_cuda_ext:
                 'nvcc':
                     append_nvcc_threads(['-O3', '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)
             })
 
-    ext_modules.append(
-        cuda_ext_helper('colossal_C', [
-            'colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu', 'multi_tensor_scale_kernel.cu',
-            'multi_tensor_adam.cu', 'multi_tensor_l2norm_kernel.cu', 'multi_tensor_lamb.cu'
-        ], ['-lineinfo']))
+    cc_flag = ['-gencode', 'arch=compute_70,code=sm_70']
 
     _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
@@ -150,6 +146,14 @@ if build_cuda_ext:
         cc_flag.append('-gencode')
         cc_flag.append('arch=compute_80,code=sm_80')
 
+    extra_cuda_flags = ['-lineinfo']
+
+    ext_modules.append(
+        cuda_ext_helper('colossal_C', [
+            'colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu', 'multi_tensor_scale_kernel.cu',
+            'multi_tensor_adam.cu', 'multi_tensor_l2norm_kernel.cu', 'multi_tensor_lamb.cu'
+        ], extra_cuda_flags + cc_flag))
+
     extra_cuda_flags = [
         '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
         '--expt-extended-lambda'
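
For context, a rough sketch of why the reordering matters, assuming cuda_ext_helper forwards its extra_cuda_flags argument into the 'nvcc' entry of extra_compile_args, as the context lines above suggest. The real helper body and append_nvcc_threads are outside this hunk, so the definition below is an assumption, not the project's actual code.

# Sketch only: assumed shape of cuda_ext_helper, inferred from the hunk's
# context lines; the project's real helper and append_nvcc_threads are not
# shown in this diff.
from torch.utils.cpp_extension import CUDAExtension


def cuda_ext_helper(name, sources, extra_cuda_flags):
    # Assumption: whatever the caller passes in extra_cuda_flags ends up on
    # the nvcc command line for this extension.
    return CUDAExtension(name=name,
                         sources=sources,
                         extra_compile_args={
                             'cxx': ['-O3'],
                             'nvcc': ['-O3', '--use_fast_math'] + extra_cuda_flags,
                         })


# Before this change the colossal_C call site passed only ['-lineinfo'];
# after it, the call passes extra_cuda_flags + cc_flag, so the -gencode
# pairs reach nvcc for this extension as well, e.g.:
cc_flag = ['-gencode', 'arch=compute_70,code=sm_70',
           '-gencode', 'arch=compute_80,code=sm_80']  # sm_80 only when CUDA >= 11
ext = cuda_ext_helper('colossal_C',
                      ['colossal_C_frontend.cpp', 'multi_tensor_adam.cu'],
                      ['-lineinfo'] + cc_flag)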