ColossalAI/colossalai/kernel/kernel_loader.py


import warnings
from typing import List, Optional, Type

from .extensions import (
    CpuAdamArmExtension,
    CpuAdamX86Extension,
    FlashAttentionDaoCudaExtension,
    FlashAttentionNpuExtension,
    FlashAttentionSdpaCudaExtension,
    FusedOptimizerCudaExtension,
    InferenceOpsCudaExtension,
    LayerNormCudaExtension,
    MoeCudaExtension,
    ScaledMaskedSoftmaxCudaExtension,
    ScaledUpperTriangleMaskedSoftmaxCudaExtension,
)
from .extensions.base_extension import _Extension

__all__ = [
    "KernelLoader",
    "CPUAdamLoader",
    "LayerNormLoader",
    "MoeLoader",
    "FusedOptimizerLoader",
    "InferenceOpsLoader",
    "ScaledMaskedSoftmaxLoader",
    "ScaledUpperTriangleMaskedSoftmaxLoader",
    "FlashAttentionLoader",
    "FlashAttentionDaoLoader",
    "FlashAttentionWithCustomMaskLoader",
    "FlashAttentionForFloatAndCustomMaskLoader",
]


class KernelLoader:
    """
    An abstract class which encapsulates the kernel loading process.

    Usage:
        kernel_loader = KernelLoader()
        kernel = kernel_loader.load()
    """

    # the registry holds extension classes, which are instantiated at load time
    REGISTRY: List[Type[_Extension]] = []

    @classmethod
    def register_extension(cls, extension: Type[_Extension]):
        """
        This classmethod is an extension point which allows users to register their customized
        kernel implementations to the loader.

        Args:
            extension (Type[_Extension]): the extension class to be registered.
        """
        cls.REGISTRY.append(extension)

    def load(self, ext_name: Optional[str] = None):
        """
        Load the kernel according to the current machine.

        Args:
            ext_name (str, optional): the name of the extension to be loaded. If not specified,
                the loader will look for a kernel available on the current machine.
        """
        exts = [ext_cls() for ext_cls in self.__class__.REGISTRY]

        # look for extensions which can be built/loaded on the current machine
        if ext_name:
            usable_exts = list(filter(lambda ext: ext.name == ext_name, exts))
        else:
            usable_exts = []
            for ext in exts:
                if ext.is_available():
                    # make sure the machine is compatible during kernel loading
                    ext.assert_compatible()
                    usable_exts.append(ext)

        assert len(usable_exts) != 0, f"No usable kernel found for {self.__class__.__name__} on the current machine."

        if len(usable_exts) > 1:
            # if more than one usable kernel is found, load the kernel with the highest priority
            usable_exts = sorted(usable_exts, key=lambda ext: ext.priority, reverse=True)
            warnings.warn(
                f"More than one kernel is available, loading the kernel with the highest priority - {usable_exts[0].__class__.__name__}"
            )
        return usable_exts[0].load()
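

# Note (illustrative, not in the original source): downstream code can plug a
# custom kernel into any loader via ``register_extension``. Given a hypothetical
# ``_Extension`` subclass, say ``MyCpuAdamExtension``, registration looks like:
#
#     CPUAdamLoader.register_extension(MyCpuAdamExtension)
#
# ``load()`` will then consider it alongside the built-in extensions, picking
# the usable extension with the highest ``priority``.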


class CPUAdamLoader(KernelLoader):
    REGISTRY = [CpuAdamX86Extension, CpuAdamArmExtension]


class LayerNormLoader(KernelLoader):
    REGISTRY = [LayerNormCudaExtension]


class MoeLoader(KernelLoader):
    REGISTRY = [MoeCudaExtension]


class FusedOptimizerLoader(KernelLoader):
    REGISTRY = [FusedOptimizerCudaExtension]


class InferenceOpsLoader(KernelLoader):
    REGISTRY = [InferenceOpsCudaExtension]


class ScaledMaskedSoftmaxLoader(KernelLoader):
    REGISTRY = [ScaledMaskedSoftmaxCudaExtension]


class ScaledUpperTriangleMaskedSoftmaxLoader(KernelLoader):
    REGISTRY = [ScaledUpperTriangleMaskedSoftmaxCudaExtension]


class FlashAttentionLoader(KernelLoader):
    REGISTRY = [
        FlashAttentionNpuExtension,
        FlashAttentionDaoCudaExtension,
        FlashAttentionSdpaCudaExtension,
    ]

class FlashAttentionDaoLoader(KernelLoader):
    REGISTRY = [FlashAttentionDaoCudaExtension]


class FlashAttentionWithCustomMaskLoader(KernelLoader):
    REGISTRY = [FlashAttentionNpuExtension, FlashAttentionSdpaCudaExtension]


class FlashAttentionForFloatAndCustomMaskLoader(KernelLoader):
    REGISTRY = [FlashAttentionSdpaCudaExtension]
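

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original module). The
# ``__main__`` guard keeps it from running on import. The extension name
# passed to ``load`` below is an assumption and must match the extension's
# ``name`` attribute on your installation.
if __name__ == "__main__":
    # Let the loader pick the best available CPU Adam kernel (x86 or ARM,
    # whichever is usable and compatible on this machine).
    cpu_adam_kernel = CPUAdamLoader().load()
    print(f"Loaded CPU Adam kernel: {cpu_adam_kernel}")

    # Alternatively, request a specific extension by name, e.g.:
    # flash_attn_kernel = FlashAttentionLoader().load(ext_name="flash_attention_dao_cuda")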