import os
import time
from abc import abstractmethod
from pathlib import Path
from typing import List

from .base_extension import _Extension
from .cpp_extension import _CppExtension
from .utils import check_pytorch_version, check_system_pytorch_cuda_match, set_cuda_arch_list

__all__ = ["_CudaExtension"]

# Some constants for installation checks: the minimum PyTorch version
# required to build CUDA extensions
MIN_PYTORCH_VERSION_MAJOR = 1
MIN_PYTORCH_VERSION_MINOR = 10


class _CudaExtension(_CppExtension):
    @abstractmethod
    def nvcc_flags(self) -> List[str]:
        """
        This function should return a list of nvcc compilation flags for the extension.
        """
        return ["-DCOLOSSAL_WITH_CUDA"]

    def is_available(self) -> bool:
        # The CUDA extension can only be built if CUDA is available.
        try:
            import torch

            # torch.cuda.is_available() requires a visible device. Setting FORCE_CUDA
            # allows building the CUDA extension on build nodes that have no GPU
            # but where the CUDA toolkit is nevertheless installed.
            cuda_available = torch.cuda.is_available() or bool(os.environ.get("FORCE_CUDA", 0))
        except Exception:
            cuda_available = False
        return cuda_available

    def assert_compatible(self) -> None:
        from torch.utils.cpp_extension import CUDA_HOME

        if not CUDA_HOME:
            raise AssertionError(
                "[extension] CUDA_HOME is not found. You need to export the CUDA_HOME environment variable or install the CUDA Toolkit first in order to build/load CUDA extensions."
            )
        check_system_pytorch_cuda_match(CUDA_HOME)
        check_pytorch_version(MIN_PYTORCH_VERSION_MAJOR, MIN_PYTORCH_VERSION_MINOR)

    def get_cuda_home_include(self):
        """
        Return the include path inside CUDA_HOME.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        if CUDA_HOME is None:
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
        cuda_include = os.path.join(CUDA_HOME, "include")
        return cuda_include

    def include_dirs(self) -> List[str]:
        """
        This function should return a list of include directories for the extension.
        """
        return super().include_dirs() + [self.get_cuda_home_include()]

    def build_jit(self):
        from torch.utils.cpp_extension import CUDA_HOME, load

        set_cuda_arch_list(CUDA_HOME)

        # get the build directory
        build_directory = _Extension.get_jit_extension_folder_path()
        build_directory = Path(build_directory)
        build_directory.mkdir(parents=True, exist_ok=True)

        # check if the kernel has been built before
        compiled_before = False
        kernel_file_path = build_directory.joinpath(f"{self.name}.so")
        if kernel_file_path.exists():
            compiled_before = True

        # load (and, if necessary, compile) the kernel
        if compiled_before:
            print(f"[extension] Loading the JIT-built {self.name} kernel during runtime now")
        else:
            print(f"[extension] Compiling the JIT {self.name} kernel during runtime now")
        build_start = time.time()
        op_kernel = load(
            name=self.name,
            sources=self.strip_empty_entries(self.sources_files()),
            extra_include_paths=self.strip_empty_entries(self.include_dirs()),
            extra_cflags=self.cxx_flags(),
            extra_cuda_cflags=self.nvcc_flags(),
            extra_ldflags=[],
            build_directory=str(build_directory),
        )
        build_duration = time.time() - build_start

        if compiled_before:
            print(f"[extension] Time taken to load {self.name} op: {build_duration} seconds")
        else:
            print(f"[extension] Time taken to compile {self.name} op: {build_duration} seconds")

        return op_kernel

    def build_aot(self) -> "CUDAExtension":
        from torch.utils.cpp_extension import CUDA_HOME, CUDAExtension

        set_cuda_arch_list(CUDA_HOME)
        return CUDAExtension(
            name=self.prebuilt_import_path,
            sources=self.strip_empty_entries(self.sources_files()),
            include_dirs=self.strip_empty_entries(self.include_dirs()),
            extra_compile_args={
                "cxx": self.strip_empty_entries(self.cxx_flags()),
                "nvcc": self.strip_empty_entries(self.nvcc_flags()),
            },
        )