From 16cc8e6aa750d7cee77cc2d0b7b897b5998f9ebf Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Tue, 3 Jan 2023 20:29:39 +0800
Subject: [PATCH] [builder] MOE builder (#2277)

---
 colossalai/kernel/__init__.py            | 16 ++++++++++--
 colossalai/kernel/op_builder/__init__.py |  3 ++-
 colossalai/kernel/op_builder/builder.py  |  4 +--
 colossalai/kernel/op_builder/moe.py      | 33 ++++++++++++++++++++++++
 colossalai/nn/layer/moe/_operation.py    | 18 +++++--------
 setup.py                                 |  6 ++---
 6 files changed, 60 insertions(+), 20 deletions(-)
 create mode 100644 colossalai/kernel/op_builder/moe.py

diff --git a/colossalai/kernel/__init__.py b/colossalai/kernel/__init__.py
index 37735fc8d..02d000362 100644
--- a/colossalai/kernel/__init__.py
+++ b/colossalai/kernel/__init__.py
@@ -24,7 +24,19 @@ except ImportError:
     from colossalai.kernel.op_builder import ScaledSoftmaxBuilder
     scaled_upper_triang_masked_softmax = ScaledSoftmaxBuilder().load()

+try:
+    from colossalai._C import moe
+except ImportError:
+    from colossalai.kernel.op_builder import MOEBuilder
+    moe = MOEBuilder().load()
+
 __all__ = [
-    "fused_optim", "cpu_optim", "multihead_attention", "LayerNorm", "FusedScaleMaskSoftmax", "MultiHeadAttention",
-    "scaled_upper_triang_masked_softmax"
+    "fused_optim",
+    "cpu_optim",
+    "multihead_attention",
+    "moe",
+    "LayerNorm",
+    "FusedScaleMaskSoftmax",
+    "MultiHeadAttention",
+    "scaled_upper_triang_masked_softmax",
 ]
diff --git a/colossalai/kernel/op_builder/__init__.py b/colossalai/kernel/op_builder/__init__.py
index 7ee7a8ab3..08832fc55 100644
--- a/colossalai/kernel/op_builder/__init__.py
+++ b/colossalai/kernel/op_builder/__init__.py
@@ -1,6 +1,7 @@
 from .cpu_adam import CPUAdamBuilder
 from .fused_optim import FusedOptimBuilder
+from .moe import MOEBuilder
 from .multi_head_attn import MultiHeadAttnBuilder
 from .scaled_upper_triang_masked_softmax import ScaledSoftmaxBuilder

-__all__ = ['CPUAdamBuilder', 'FusedOptimBuilder', 'MultiHeadAttnBuilder', 'ScaledSoftmaxBuilder']
+__all__ = ['CPUAdamBuilder', 'FusedOptimBuilder', 'MultiHeadAttnBuilder', 'ScaledSoftmaxBuilder', 'MOEBuilder']
diff --git a/colossalai/kernel/op_builder/builder.py b/colossalai/kernel/op_builder/builder.py
index 3c64c3d59..18c41b0ce 100644
--- a/colossalai/kernel/op_builder/builder.py
+++ b/colossalai/kernel/op_builder/builder.py
@@ -1,12 +1,12 @@
 import os
 import re
-import sys
 from pathlib import Path
+from typing import List

 import torch


-def get_cuda_cc_flag():
+def get_cuda_cc_flag() -> List:
     """get_cuda_cc_flag

     cc flag for your GPU arch
diff --git a/colossalai/kernel/op_builder/moe.py b/colossalai/kernel/op_builder/moe.py
new file mode 100644
index 000000000..5f74e1a72
--- /dev/null
+++ b/colossalai/kernel/op_builder/moe.py
@@ -0,0 +1,33 @@
+import os
+
+from .builder import Builder, get_cuda_cc_flag
+
+
+class MOEBuilder(Builder):
+
+    def __init__(self):
+        self.base_dir = "cuda_native/csrc"
+        self.name = 'moe'
+        super().__init__()
+
+    def include_dirs(self):
+        ret = []
+        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_home_include()]
+        ret.append(os.path.join(self.base_dir, "kernels", "include"))
+        return [self.colossalai_src_path(path) for path in ret]
+
+    def sources_files(self):
+        ret = [os.path.join(self.base_dir, fname) for fname in ['moe_cuda.cpp', 'moe_cuda_kernel.cu']]
+        return [self.colossalai_src_path(path) for path in ret]
+
+    def cxx_flags(self):
+        return ['-O3', '-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
+
+    def nvcc_flags(self):
+        extra_cuda_flags = [
+            '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
+            '--expt-extended-lambda'
+        ]
+        extra_cuda_flags.extend(get_cuda_cc_flag())
+        ret = ['-O3', '--use_fast_math'] + extra_cuda_flags
+        return ret
diff --git a/colossalai/nn/layer/moe/_operation.py b/colossalai/nn/layer/moe/_operation.py
index 278cdfbb7..d06025db1 100644
--- a/colossalai/nn/layer/moe/_operation.py
+++ b/colossalai/nn/layer/moe/_operation.py
@@ -6,12 +6,7 @@ from torch import Tensor
 from torch.distributed import ProcessGroup

 COL_MOE_KERNEL_FLAG = False
-try:
-    import colossalai._C.moe
-
-    COL_MOE_KERNEL_FLAG = True
-except ImportError:
-    print("If you want to activate cuda mode for MoE, please install with cuda_ext!")
+from colossalai.kernel import moe


 class AllGather(torch.autograd.Function):
@@ -90,7 +85,7 @@ class MoeDispatch(torch.autograd.Function):
         s = tokens.size(0)
         h = tokens.size(1)

-        expert_input = colossalai._C.moe.dispatch_forward(s, ec, h, tokens, mask, dest_idx)
+        expert_input = moe.dispatch_forward(s, ec, h, tokens, mask, dest_idx)

         ctx.save_for_backward(mask, dest_idx)
         ctx.s = s
@@ -102,7 +97,7 @@ class MoeDispatch(torch.autograd.Function):
     @staticmethod
     def backward(ctx, output_grad):
         mask, dest_idx = ctx.saved_tensors
-        d_tokens = colossalai._C.moe.dispatch_backward(ctx.s, ctx.ec, ctx.h, output_grad, mask, dest_idx)
+        d_tokens = moe.dispatch_backward(ctx.s, ctx.ec, ctx.h, output_grad, mask, dest_idx)
         return d_tokens, None, None, None


@@ -119,7 +114,7 @@ class MoeCombine(torch.autograd.Function):

         fp16_flag = (expert_tokens.dtype == torch.float16)
         cb_input = expert_tokens.to(torch.float32) if fp16_flag else expert_tokens
-        ctokens = colossalai._C.moe.combine_forward(s, e, c, h, cb_input, logits, mask, dest_idx)
+        ctokens = moe.combine_forward(s, e, c, h, cb_input, logits, mask, dest_idx)
         output = ctokens.to(torch.float16) if fp16_flag else ctokens

         ctx.save_for_backward(expert_tokens, logits, mask, dest_idx)
@@ -138,8 +133,7 @@ class MoeCombine(torch.autograd.Function):
         cb_grad = tokens_grad.to(torch.float32) if tokens_grad.dtype is torch.float16 \
             else tokens_grad
         cb_input = expert_tokens.to(torch.float32) if ctx.fp16_flag else expert_tokens
-        d_expert, d_logits = colossalai._C.moe.combine_backward(ctx.s, ctx.e, ctx.c, ctx.h, cb_grad, cb_input, logits,
-                                                                mask, dest_idx)
+        d_expert, d_logits = moe.combine_backward(ctx.s, ctx.e, ctx.c, ctx.h, cb_grad, cb_input, logits, mask, dest_idx)
         d_expert = d_expert.to(torch.float16) if ctx.fp16_flag else d_expert

         return d_expert, d_logits, None, None, None
@@ -149,6 +143,6 @@ def moe_cumsum(inputs: Tensor):
     dim0 = inputs.size(0)
     flag = (dim0 <= 1024) or (dim0 <= 2048 and dim0 % 2 == 0) or (dim0 % 4 == 0)
     if flag and COL_MOE_KERNEL_FLAG:
-        return colossalai._C.moe.cumsum_sub_one(inputs)
+        return moe.cumsum_sub_one(inputs)
     else:
         return torch.cumsum(inputs, dim=0) - 1
diff --git a/setup.py b/setup.py
index b296970c2..573a94b4f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 import os
 import re

-from setuptools import Extension, find_packages, setup
+from setuptools import find_packages, setup

 from colossalai.kernel.op_builder.utils import get_cuda_bare_metal_version

@@ -161,8 +161,8 @@ if build_cuda_ext:
         cuda_ext_helper('colossalai._C.scaled_masked_softmax',
                         ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], extra_cuda_flags + cc_flag))

-    ext_modules.append(
-        cuda_ext_helper('colossalai._C.moe', ['moe_cuda.cpp', 'moe_cuda_kernel.cu'], extra_cuda_flags + cc_flag))
+    from colossalai.kernel.op_builder import MOEBuilder
+    ext_modules.append(MOEBuilder().builder('colossalai._C.moe'))

     extra_cuda_flags = ['-maxrregcount=50']
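
For context, a minimal sketch of the fallback pattern this patch introduces in
colossalai/kernel/__init__.py. It assumes, as with the other builders in
colossalai/kernel/op_builder, that MOEBuilder().load() JIT-compiles the sources
listed in op_builder/moe.py (moe_cuda.cpp, moe_cuda_kernel.cu) and returns a
module with the same interface as the prebuilt colossalai._C.moe extension:

    # Prefer the ahead-of-time extension built by setup.py; otherwise JIT-build it.
    try:
        from colossalai._C import moe
    except ImportError:
        from colossalai.kernel.op_builder import MOEBuilder
        moe = MOEBuilder().load()

    # Either way, callers such as colossalai/nn/layer/moe/_operation.py can use
    # the kernel entry points directly, e.g.:
    #   moe.dispatch_forward(s, ec, h, tokens, mask, dest_idx)
    #   moe.cumsum_sub_one(inputs)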