# This code has been adapted from the DeepSpeed library.
# Copyright (c) Microsoft Corporation.

# Licensed under the MIT License.
import importlib
import os
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Optional, Union

from .utils import check_cuda_availability, check_system_pytorch_cuda_match, print_rank_0


class ExtensionBuilder(ABC):
    """
    Builder is the base class to build extensions for PyTorch.

    Args:
        name (str): the name of the kernel to be built
        prebuilt_import_path (str): the path where the extension is installed during pip install
    """

    ext_type: str = "cuda"

    def __init__(self, name: str, prebuilt_import_path: str):
        self.name = name
        self.prebuilt_import_path = prebuilt_import_path
        self.version_dependent_macros = ["-DVERSION_GE_1_1", "-DVERSION_GE_1_3", "-DVERSION_GE_1_5"]

        # we store the op as an attribute to avoid repeated building and loading
        self.cached_op_module = None

        assert prebuilt_import_path.startswith(
            "colossalai._C"
        ), f"The prebuilt_import_path should start with colossalai._C, but got {self.prebuilt_import_path}"

    def relative_to_abs_path(self, code_path: str) -> str:
        """
        This function takes in a path relative to the colossalai root directory and return the absolute path.
        """
        op_builder_module_path = Path(__file__).parent

        # if we install from source
        # the current file path will be op_builder/builder.py
        # if we install via pip install colossalai
        # the current file path will be colossalai/kernel/op_builder/builder.py
        # this is because that the op_builder inside colossalai is a symlink
        # this symlink will be replaced with actual files if we install via pypi
        # thus we cannot tell the colossalai root directory by checking whether the op_builder
        # is a symlink, we can only tell whether it is inside or outside colossalai
        if str(op_builder_module_path).endswith("colossalai/kernel/op_builder"):
            root_path = op_builder_module_path.parent.parent
        elif str(op_builder_module_path).endswith("colossalai/kernel/extensions"):
            root_path = op_builder_module_path.parent.parent
        else:
            root_path = op_builder_module_path.parent.joinpath("colossalai")

        code_abs_path = root_path.joinpath(code_path)
        return str(code_abs_path)

    def get_cuda_home_include(self):
        """
        return include path inside the cuda home.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        if CUDA_HOME is None:
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
        cuda_include = os.path.join(CUDA_HOME, "include")
        return cuda_include

    def csrc_abs_path(self, path):
        return os.path.join(self.relative_to_abs_path("kernel/cuda_native/csrc"), path)

    # functions must be overrided begin
    @abstractmethod
    def sources_files(self) -> List[str]:
        """
        This function should return a list of source files for extensions.
        """
        raise NotImplementedError

    @abstractmethod
    def include_dirs(self) -> List[str]:
        """
        This function should return a list of include files for extensions.
        """

    @abstractmethod
    def cxx_flags(self) -> List[str]:
        """
        This function should return a list of cxx compilation flags for extensions.
        """

    @abstractmethod
    def nvcc_flags(self) -> List[str]:
        """
        This function should return a list of nvcc compilation flags for extensions.
        """

    # functions must be overrided over
    def strip_empty_entries(self, args):
        """
        Drop any empty strings from the list of compile and link flags
        """
        return [x for x in args if len(x) > 0]

    def import_op(self):
        """
        This function will import the op module by its string name.
        """
        return importlib.import_module(self.prebuilt_import_path)

    def check_runtime_build_environment(self):
        """
        Check whether the system environment is ready for extension compilation.
        """
        try:
            from torch.utils.cpp_extension import CUDA_HOME

            TORCH_AVAILABLE = True
        except ImportError:
            TORCH_AVAILABLE = False
            CUDA_HOME = None

        if not TORCH_AVAILABLE:
            raise ModuleNotFoundError(
                "PyTorch is not found. You need to install PyTorch first in order to build CUDA extensions"
            )

        if CUDA_HOME is None:
            raise RuntimeError(
                "CUDA_HOME is not found. You need to export CUDA_HOME environment variable or install CUDA Toolkit first in order to build CUDA extensions"
            )

        # make sure CUDA is available for compilation during
        cuda_available = check_cuda_availability()
        if not cuda_available:
            raise RuntimeError("CUDA is not available on your system as torch.cuda.is_available() returns False.")

        # make sure system CUDA and pytorch CUDA match, an error will raised inside the function if not
        check_system_pytorch_cuda_match(CUDA_HOME)

    def build(self, verbose: Optional[bool] = None):
        """
        If the kernel is not built during pip install, it will build the kernel.
        If the kernel is built during runtime, it will be stored in `~/.cache/colossalai/torch_extensions/`. If the
        kernel is built during pip install, it can be accessed through `colossalai._C`.

        Warning: do not load this kernel repeatedly during model execution as it could slow down the training process.

        Args:
            verbose (bool, optional): show detailed info. Defaults to True.
        """
        if verbose is None:
            verbose = os.environ.get("CAI_KERNEL_VERBOSE", "0") == "1"
        try:
            # if the kernel has been pre-built during installation
            # we just directly import it
            op_module = self.import_op()
            if verbose:
                print_rank_0(
                    f"[extension] OP {self.prebuilt_import_path} has been compiled ahead of time, skip building."
                )
        except ImportError:
            # check environment
            if self.ext_type == "cuda":
                self.check_runtime_build_environment()

            # time the kernel compilation
            start_build = time.time()

            # construct the build directory
            import torch
            from torch.utils.cpp_extension import load

            torch_version_major = torch.__version__.split(".")[0]
            torch_version_minor = torch.__version__.split(".")[1]
            torch_cuda_version = torch.version.cuda
            home_directory = os.path.expanduser("~")
            extension_directory = f".cache/colossalai/torch_extensions/torch{torch_version_major}.{torch_version_minor}_cu{torch_cuda_version}"
            build_directory = os.path.join(home_directory, extension_directory)
            Path(build_directory).mkdir(parents=True, exist_ok=True)

            if verbose:
                print_rank_0(f"[extension] Compiling or loading the JIT-built {self.name} kernel during runtime now")

            # load the kernel
            op_module = load(
                name=self.name,
                sources=self.strip_empty_entries(self.sources_files()),
                extra_include_paths=self.strip_empty_entries(self.include_dirs()),
                extra_cflags=self.cxx_flags(),
                extra_cuda_cflags=self.nvcc_flags(),
                extra_ldflags=[],
                build_directory=build_directory,
                verbose=verbose,
            )

            build_duration = time.time() - start_build

            # log jit compilation time
            if verbose:
                print_rank_0(f"[extension] Time to compile or load {self.name} op: {build_duration} seconds")

        # cache the built/loaded kernel
        self.cached_op_module = op_module

    def load(self, verbose: Optional[bool] = None):
        """
        load the kernel during runtime.

        Args:
            verbose (bool, optional): show detailed info. Defaults to True.
        """
        # if the kernel has be compiled and cached, we directly use it
        assert self.cached_op_module is not None, "Please build the kernel first before loading it."
        return self.cached_op_module

    def builder(self) -> Union["CUDAExtension", "CppExtension"]:
        """
        get a CUDAExtension instance used for setup.py
        """
        from torch.utils.cpp_extension import CppExtension, CUDAExtension

        if self.ext_type == "cpp":
            return CppExtension(
                name=self.prebuilt_import_path,
                sources=self.strip_empty_entries(self.sources_files()),
                include_dirs=self.strip_empty_entries(self.include_dirs()),
                extra_compile_args=self.strip_empty_entries(self.cxx_flags()),
            )

        return CUDAExtension(
            name=self.prebuilt_import_path,
            sources=self.strip_empty_entries(self.sources_files()),
            include_dirs=self.strip_empty_entries(self.include_dirs()),
            extra_compile_args={
                "cxx": self.strip_empty_entries(self.cxx_flags()),
                "nvcc": self.strip_empty_entries(self.nvcc_flags()),
            },
        )