ColossalAI/colossalai/kernel/cuda_native/csrc/kernels/transform_kernels.cu

#include <cub/block/block_load.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_store.cuh>

#include "kernels.h"

using namespace cub;

/**
@brief: transform_0213
Split the attention heads and reshape input
during backward progress of encoder self-attention

@thread
gridDim.x = batch_size
gridDim.y = seq_len
blockDim.x = min(hidden_dim, MAX_THREADS)

@param
input: [batch_size, seq_len, hidden_dim]
output: [batch_size, nhead, seq_len, head_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
nhead: number of attention heads
*/

template <typename T>
__global__ void transform_0213(T *output, const T *input, int hidden_dim,
                               int head_dim);

template <>
__global__ void transform_0213<float>(float *output, const float *input,
                                      int hidden_dim, int head_dim) {
  int batch_id = blockIdx.x;
  int token_id = blockIdx.y;
  int seq_len = gridDim.y;
  int nhead = hidden_dim / head_dim;

  // [b, s, h]
  int src_offset = flat_3dim(batch_id, token_id, 0, seq_len, hidden_dim);
  // [b, nh, s, ad]
  int trg_offset =
      flat_4dim(batch_id, 0, token_id, 0, nhead, seq_len, head_dim);

  const float4 *input4 = reinterpret_cast<const float4 *>(input);
  float4 *res4 = reinterpret_cast<float4 *>(output);
  float4 vinput4;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinput4 = input4[src_offset + i];

    int head_id = i / head_dim;
    int dim_id = i % head_dim;
    int cur_trg_offset = flat_3dim(head_id, 0, dim_id, seq_len, head_dim);
    res4[trg_offset + cur_trg_offset] = vinput4;
  }
}

template <>
__global__ void transform_0213<__half>(__half *output, const __half *input,
                                       int hidden_dim, int head_dim) {
  int batch_id = blockIdx.x;
  int token_id = blockIdx.y;
  int seq_len = gridDim.y;
  int nhead = hidden_dim / head_dim;

  // [b, s, h]
  int src_offset = flat_3dim(batch_id, token_id, 0, seq_len, hidden_dim);
  // [b, nh, s, ad]
  int trg_offset =
      flat_4dim(batch_id, 0, token_id, 0, nhead, seq_len, head_dim);

  const float4 *input4 = reinterpret_cast<const float4 *>(input);
  float4 *res4 = reinterpret_cast<float4 *>(output);
  float4 vinput4;

  for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {
    vinput4 = input4[src_offset + i];

    int head_id = i / head_dim;
    int dim_id = i % head_dim;
    int cur_trg_offset = flat_3dim(head_id, 0, dim_id, seq_len, head_dim);
    res4[trg_offset + cur_trg_offset] = vinput4;
  }
}

// [b, s, h] -> [b, nh, s, ad]
template <>
void launch_transform_0213<float>(float *output, const float *input,
                                  int batch_size, int seq_len, int hidden_dim,
                                  int nhead, cudaStream_t stream) {
  hidden_dim >>= 2;
  int head_dim = hidden_dim / nhead;

  dim3 grid_dim(batch_size, seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  transform_0213<float>
      <<<grid_dim, block_dim, 0, stream>>>(output, input, hidden_dim, head_dim);
}

template <>
void launch_transform_0213<__half>(__half *output, const __half *input,
                                   int batch_size, int seq_len, int hidden_dim,
                                   int nhead, cudaStream_t stream) {
  hidden_dim >>= 3;
  int head_dim = hidden_dim / nhead;

  dim3 grid_dim(batch_size, seq_len);
  dim3 block_dim(min(hidden_dim, MAX_THREADS));

  transform_0213<__half>
      <<<grid_dim, block_dim, 0, stream>>>(output, input, hidden_dim, head_dim);
}

/**
@brief: bias_add_transform_20314
Add bias to input, transform from
[0, 1, 2, 3, 4] to [2, 0, 3, 1, 4]

@thread
gridDim.x = dim_0
gridDim.y = dim_1
gridDim.z = dim_2
blockDim.x = min(dim_3 * dim_4, MAX_THREADS)

@param
input: [dim_0, dim_1, dim_2, dim_3, dim_4]
bias: [dim_2, dim_3, dim_4]
output: [dim_2, dim_0, dim_3, dim_1, dim_4]
*/
template <typename T>
__global__ void bias_add_transform_20314(T *output, const T *input,
                                         const T *bias, int dim_3, int dim_4);

template <>
__global__ void
bias_add_transform_20314<float>(float *output, const float *input,
                                const float *bias, int dim_3, int dim_4) {
  int id0 = blockIdx.x;
  int id1 = blockIdx.y;
  int id2 = blockIdx.z;
  int dim_0 = gridDim.x;
  int dim_1 = gridDim.y;
  int dim_2 = gridDim.z;
  int dim_34 = dim_3 * dim_4;

  int src_offset = flat_4dim(id0, id1, id2, 0, dim_1, dim_2, dim_34);
  int trg_offset = flat_5dim(id2, id0, 0, id1, 0, dim_0, dim_3, dim_1, dim_4);
  int bias_offset = flat_2dim(id2, 0, dim_34);

  const float4 *qkv4 = reinterpret_cast<const float4 *>(input);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  float4 *res4 = reinterpret_cast<float4 *>(output);
  float4 vqkv4;
  float4 vbias4;
  float4 vres4;

  for (std::size_t i = threadIdx.x; i < dim_34; i += blockDim.x) {
    vqkv4 = qkv4[src_offset + i];
    vbias4 = bias4[bias_offset + i];
    vres4.x = vqkv4.x + vbias4.x;
    vres4.y = vqkv4.y + vbias4.y;
    vres4.z = vqkv4.z + vbias4.z;
    vres4.w = vqkv4.w + vbias4.w;

    int id3 = i / dim_4;
    int id4 = i % dim_4;
    int cur_trg_offset = flat_3dim(id3, 0, id4, dim_1, dim_4);
    res4[trg_offset + cur_trg_offset] = vres4;
  }
}

template <>
__global__ void
bias_add_transform_20314<__half>(__half *output, const __half *input,
                                 const __half *bias, int dim_3, int dim_4) {
  int id0 = blockIdx.x;
  int id1 = blockIdx.y;
  int id2 = blockIdx.z;
  int dim_0 = gridDim.x;
  int dim_1 = gridDim.y;
  int dim_2 = gridDim.z;
  int dim_34 = dim_3 * dim_4;

  int src_offset = flat_4dim(id0, id1, id2, 0, dim_1, dim_2, dim_34);
  int trg_offset = flat_5dim(id2, id0, 0, id1, 0, dim_0, dim_3, dim_1, dim_4);
  int bias_offset = flat_2dim(id2, 0, dim_34);

  const float4 *qkv4 = reinterpret_cast<const float4 *>(input);
  const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
  float4 *res4 = reinterpret_cast<float4 *>(output);
  float4 vqkv4;
  float4 vbias4;
  float4 vres4;
  __half2 *h2_qkv = reinterpret_cast<__half2 *>(&vqkv4);
  __half2 *h2_bias = reinterpret_cast<__half2 *>(&vbias4);
  __half2 *h2_res = reinterpret_cast<__half2 *>(&vres4);

  for (std::size_t i = threadIdx.x; i < dim_34; i += blockDim.x) {
    vqkv4 = qkv4[src_offset + i];
    vbias4 = bias4[bias_offset + i];
    h2_res[0] = __hadd2(h2_qkv[0], h2_bias[0]);
    h2_res[1] = __hadd2(h2_qkv[1], h2_bias[1]);
    h2_res[2] = __hadd2(h2_qkv[2], h2_bias[2]);
    h2_res[3] = __hadd2(h2_qkv[3], h2_bias[3]);

    int id3 = i / dim_4;
    int id4 = i % dim_4;
    int cur_trg_offset = flat_3dim(id3, 0, id4, dim_1, dim_4);
    res4[trg_offset + cur_trg_offset] = vres4;
  }
}

// [b, s, 3, h] -> [3, b, nh, s, ad]
template <>
void launch_bias_add_transform_20314<float>(float *output, const float *input,
                                            const float *bias, int dim_0,
                                            int dim_1, int dim_2, int dim_3,
                                            int dim_4, cudaStream_t stream) {
  dim_4 >>= 2;

  dim3 grid_dim(dim_0, dim_1, dim_2);
  dim3 block_dim(min(dim_3 * dim_4, MAX_THREADS));

  bias_add_transform_20314<float>
      <<<grid_dim, block_dim, 0, stream>>>(output, input, bias, dim_3, dim_4);
}

template <>
void launch_bias_add_transform_20314<__half>(__half *output,
                                             const __half *input,
                                             const __half *bias, int dim_0,
                                             int dim_1, int dim_2, int dim_3,
                                             int dim_4, cudaStream_t stream) {
  dim_4 >>= 3;

  dim3 grid_dim(dim_0, dim_1, dim_2);
  dim3 block_dim(min(dim_3 * dim_4, MAX_THREADS));

  bias_add_transform_20314<__half>
      <<<grid_dim, block_dim, 0, stream>>>(output, input, bias, dim_3, dim_4);
}

/**
@brief: transform4d_0213
Reshape the input matrix to merge the heads

@thread
gridDim.x = (num_all + max_block_thread - 1) / max_block_thread
blockDim.x = max_block_thread

@param
input: [trans_count, batch_size, nhead, seq_len, head_dim]
output: [batch_size, seq_len, trans_count, nhead, head_dim]
batch_size: the size of the current batch
seq_len: the sequence length of the current batch
hidden_dim: dim of the hidden tensor
nhead: number of attention heads
trans_count: 1 or 3, the count of matrice need to be transformed
*/
template <typename T>
__global__ void transform4d_0213(T *output, const T *input, int batch_size,
                                 int seq_len, int trans_count, int nhead,
                                 int head_dim, int num_all) {
  int offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (offset >= num_all) {
    return;
  }
  int trans_id, batch_id, head_id, token_id, dim_id;
  decompose_5dim(offset, batch_size, nhead, seq_len, head_dim, &trans_id,
                 &batch_id, &head_id, &token_id, &dim_id);
  // [b, s, tc, nh, ad]
  int trg_offset = flat_5dim(batch_id, token_id, trans_id, head_id, dim_id,
                             seq_len, trans_count, nhead, head_dim);

  const float4 *input4 = reinterpret_cast<const float4 *>(input);
  float4 *res4 = reinterpret_cast<float4 *>(output);
  res4[trg_offset] = input4[offset];
}

// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]
template <>
void launch_transform4d_0213<float>(float *output, const float *input,
                                    int batch_size, int seq_len, int hidden_dim,
                                    int nhead, int trans_count,
                                    cudaStream_t stream) {
  hidden_dim >>= 2;
  int head_dim = hidden_dim / nhead;
  int num_all = batch_size * seq_len * trans_count * hidden_dim;
  int nblock = (num_all + MAX_THREADS - 1) / MAX_THREADS;

  transform4d_0213<float><<<nblock, MAX_THREADS, 0, stream>>>(
      output, input, batch_size, seq_len, trans_count, nhead, head_dim,
      num_all);
}

template <>
void launch_transform4d_0213<__half>(__half *output, const __half *input,
                                     int batch_size, int seq_len,
                                     int hidden_dim, int nhead, int trans_count,
                                     cudaStream_t stream) {
  hidden_dim >>= 3;
  int head_dim = hidden_dim / nhead;
  int num_all = batch_size * seq_len * trans_count * hidden_dim;
  int nblock = (num_all + MAX_THREADS - 1) / MAX_THREADS;

  transform4d_0213<__half><<<nblock, MAX_THREADS, 0, stream>>>(
      output, input, batch_size, seq_len, trans_count, nhead, head_dim,
      num_all);
}
add colossalai kernel module (#55) 2021-12-21 04:19:52 +00:00			`#include <cub/block/block_load.cuh>`
			`#include <cub/block/block_scan.cuh>`
			`#include <cub/block/block_store.cuh>`

			`#include "kernels.h"`

			`using namespace cub;`

			`/**`
			`@brief: transform_0213`
			`Split the attention heads and reshape input`
			`during backward progress of encoder self-attention`

			`@thread`
			`gridDim.x = batch_size`
			`gridDim.y = seq_len`
			`blockDim.x = min(hidden_dim, MAX_THREADS)`

			`@param`
			`input: [batch_size, seq_len, hidden_dim]`
			`output: [batch_size, nhead, seq_len, head_dim]`
			`batch_size: the size of the current batch`
			`seq_len: the sequence length of the current batch`
			`hidden_dim: dim of the hidden tensor`
			`nhead: number of attention heads`
			`*/`

			`template <typename T>`
			`__global__ void transform_0213(T output, const T input, int hidden_dim,`
			`int head_dim);`

			`template <>`
			`__global__ void transform_0213<float>(float output, const float input,`
			`int hidden_dim, int head_dim) {`
			`int batch_id = blockIdx.x;`
			`int token_id = blockIdx.y;`
			`int seq_len = gridDim.y;`
			`int nhead = hidden_dim / head_dim;`

			`// [b, s, h]`
			`int src_offset = flat_3dim(batch_id, token_id, 0, seq_len, hidden_dim);`
			`// [b, nh, s, ad]`
			`int trg_offset =`
			`flat_4dim(batch_id, 0, token_id, 0, nhead, seq_len, head_dim);`

			`const float4 input4 = reinterpret_cast<const float4 >(input);`
			`float4 res4 = reinterpret_cast<float4 >(output);`
			`float4 vinput4;`

			`for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {`
			`vinput4 = input4[src_offset + i];`

			`int head_id = i / head_dim;`
			`int dim_id = i % head_dim;`
			`int cur_trg_offset = flat_3dim(head_id, 0, dim_id, seq_len, head_dim);`
			`res4[trg_offset + cur_trg_offset] = vinput4;`
			`}`
			`}`

			`template <>`
			`__global__ void transform_0213<__half>(__half output, const __half input,`
			`int hidden_dim, int head_dim) {`
			`int batch_id = blockIdx.x;`
			`int token_id = blockIdx.y;`
			`int seq_len = gridDim.y;`
			`int nhead = hidden_dim / head_dim;`

			`// [b, s, h]`
			`int src_offset = flat_3dim(batch_id, token_id, 0, seq_len, hidden_dim);`
			`// [b, nh, s, ad]`
			`int trg_offset =`
			`flat_4dim(batch_id, 0, token_id, 0, nhead, seq_len, head_dim);`

			`const float4 input4 = reinterpret_cast<const float4 >(input);`
			`float4 res4 = reinterpret_cast<float4 >(output);`
			`float4 vinput4;`

			`for (std::size_t i = threadIdx.x; i < hidden_dim; i += blockDim.x) {`
			`vinput4 = input4[src_offset + i];`

			`int head_id = i / head_dim;`
			`int dim_id = i % head_dim;`
			`int cur_trg_offset = flat_3dim(head_id, 0, dim_id, seq_len, head_dim);`
			`res4[trg_offset + cur_trg_offset] = vinput4;`
			`}`
			`}`

			`// [b, s, h] -> [b, nh, s, ad]`
			`template <>`
			`void launch_transform_0213<float>(float output, const float input,`
			`int batch_size, int seq_len, int hidden_dim,`
			`int nhead, cudaStream_t stream) {`
			`hidden_dim >>= 2;`
			`int head_dim = hidden_dim / nhead;`

			`dim3 grid_dim(batch_size, seq_len);`
			`dim3 block_dim(min(hidden_dim, MAX_THREADS));`

			`transform_0213<float>`
			`<<<grid_dim, block_dim, 0, stream>>>(output, input, hidden_dim, head_dim);`
			`}`

			`template <>`
			`void launch_transform_0213<__half>(__half output, const __half input,`
			`int batch_size, int seq_len, int hidden_dim,`
			`int nhead, cudaStream_t stream) {`
			`hidden_dim >>= 3;`
			`int head_dim = hidden_dim / nhead;`

			`dim3 grid_dim(batch_size, seq_len);`
			`dim3 block_dim(min(hidden_dim, MAX_THREADS));`

			`transform_0213<__half>`
			`<<<grid_dim, block_dim, 0, stream>>>(output, input, hidden_dim, head_dim);`
			`}`

			`/**`
			`@brief: bias_add_transform_20314`
			`Add bias to input, transform from`
			`[0, 1, 2, 3, 4] to [2, 0, 3, 1, 4]`

			`@thread`
			`gridDim.x = dim_0`
			`gridDim.y = dim_1`
			`gridDim.z = dim_2`
			`blockDim.x = min(dim_3 * dim_4, MAX_THREADS)`

			`@param`
			`input: [dim_0, dim_1, dim_2, dim_3, dim_4]`
			`bias: [dim_2, dim_3, dim_4]`
			`output: [dim_2, dim_0, dim_3, dim_1, dim_4]`
			`*/`
			`template <typename T>`
			`__global__ void bias_add_transform_20314(T output, const T input,`
			`const T *bias, int dim_3, int dim_4);`

			`template <>`
[NFC] polish colossalai/kernel/cuda_native/csrc/kernels/transform_kernels.cu code stype (#629) 2022-04-01 12:20:54 +00:00			`__global__ void`
			`bias_add_transform_20314<float>(float output, const float input,`
			`const float *bias, int dim_3, int dim_4) {`
add colossalai kernel module (#55) 2021-12-21 04:19:52 +00:00			`int id0 = blockIdx.x;`
			`int id1 = blockIdx.y;`
			`int id2 = blockIdx.z;`
			`int dim_0 = gridDim.x;`
			`int dim_1 = gridDim.y;`
			`int dim_2 = gridDim.z;`
			`int dim_34 = dim_3 * dim_4;`

			`int src_offset = flat_4dim(id0, id1, id2, 0, dim_1, dim_2, dim_34);`
			`int trg_offset = flat_5dim(id2, id0, 0, id1, 0, dim_0, dim_3, dim_1, dim_4);`
			`int bias_offset = flat_2dim(id2, 0, dim_34);`

			`const float4 qkv4 = reinterpret_cast<const float4 >(input);`
			`const float4 bias4 = reinterpret_cast<const float4 >(bias);`
			`float4 res4 = reinterpret_cast<float4 >(output);`
			`float4 vqkv4;`
			`float4 vbias4;`
			`float4 vres4;`

			`for (std::size_t i = threadIdx.x; i < dim_34; i += blockDim.x) {`
			`vqkv4 = qkv4[src_offset + i];`
			`vbias4 = bias4[bias_offset + i];`
			`vres4.x = vqkv4.x + vbias4.x;`
			`vres4.y = vqkv4.y + vbias4.y;`
			`vres4.z = vqkv4.z + vbias4.z;`
			`vres4.w = vqkv4.w + vbias4.w;`

			`int id3 = i / dim_4;`
			`int id4 = i % dim_4;`
			`int cur_trg_offset = flat_3dim(id3, 0, id4, dim_1, dim_4);`
			`res4[trg_offset + cur_trg_offset] = vres4;`
			`}`
			`}`

			`template <>`
[NFC] polish colossalai/kernel/cuda_native/csrc/kernels/transform_kernels.cu code stype (#629) 2022-04-01 12:20:54 +00:00			`__global__ void`
			`bias_add_transform_20314<__half>(__half output, const __half input,`
			`const __half *bias, int dim_3, int dim_4) {`
add colossalai kernel module (#55) 2021-12-21 04:19:52 +00:00			`int id0 = blockIdx.x;`
			`int id1 = blockIdx.y;`
			`int id2 = blockIdx.z;`
			`int dim_0 = gridDim.x;`
			`int dim_1 = gridDim.y;`
			`int dim_2 = gridDim.z;`
			`int dim_34 = dim_3 * dim_4;`

			`int src_offset = flat_4dim(id0, id1, id2, 0, dim_1, dim_2, dim_34);`
			`int trg_offset = flat_5dim(id2, id0, 0, id1, 0, dim_0, dim_3, dim_1, dim_4);`
			`int bias_offset = flat_2dim(id2, 0, dim_34);`

			`const float4 qkv4 = reinterpret_cast<const float4 >(input);`
			`const float4 bias4 = reinterpret_cast<const float4 >(bias);`
			`float4 res4 = reinterpret_cast<float4 >(output);`
			`float4 vqkv4;`
			`float4 vbias4;`
			`float4 vres4;`
			`__half2 h2_qkv = reinterpret_cast<__half2 >(&vqkv4);`
			`__half2 h2_bias = reinterpret_cast<__half2 >(&vbias4);`
			`__half2 h2_res = reinterpret_cast<__half2 >(&vres4);`

			`for (std::size_t i = threadIdx.x; i < dim_34; i += blockDim.x) {`
			`vqkv4 = qkv4[src_offset + i];`
			`vbias4 = bias4[bias_offset + i];`
			`h2_res[0] = __hadd2(h2_qkv[0], h2_bias[0]);`
			`h2_res[1] = __hadd2(h2_qkv[1], h2_bias[1]);`
			`h2_res[2] = __hadd2(h2_qkv[2], h2_bias[2]);`
			`h2_res[3] = __hadd2(h2_qkv[3], h2_bias[3]);`

			`int id3 = i / dim_4;`
			`int id4 = i % dim_4;`
			`int cur_trg_offset = flat_3dim(id3, 0, id4, dim_1, dim_4);`
			`res4[trg_offset + cur_trg_offset] = vres4;`
			`}`
			`}`

			`// [b, s, 3, h] -> [3, b, nh, s, ad]`
			`template <>`
			`void launch_bias_add_transform_20314<float>(float output, const float input,`
			`const float *bias, int dim_0,`
			`int dim_1, int dim_2, int dim_3,`
			`int dim_4, cudaStream_t stream) {`
			`dim_4 >>= 2;`

			`dim3 grid_dim(dim_0, dim_1, dim_2);`
			`dim3 block_dim(min(dim_3 * dim_4, MAX_THREADS));`

			`bias_add_transform_20314<float>`
			`<<<grid_dim, block_dim, 0, stream>>>(output, input, bias, dim_3, dim_4);`
			`}`

			`template <>`
			`void launch_bias_add_transform_20314<__half>(__half *output,`
			`const __half *input,`
			`const __half *bias, int dim_0,`
			`int dim_1, int dim_2, int dim_3,`
			`int dim_4, cudaStream_t stream) {`
			`dim_4 >>= 3;`

			`dim3 grid_dim(dim_0, dim_1, dim_2);`
			`dim3 block_dim(min(dim_3 * dim_4, MAX_THREADS));`

			`bias_add_transform_20314<__half>`
			`<<<grid_dim, block_dim, 0, stream>>>(output, input, bias, dim_3, dim_4);`
			`}`

			`/**`
			`@brief: transform4d_0213`
			`Reshape the input matrix to merge the heads`

			`@thread`
			`gridDim.x = (num_all + max_block_thread - 1) / max_block_thread`
			`blockDim.x = max_block_thread`

			`@param`
			`input: [trans_count, batch_size, nhead, seq_len, head_dim]`
			`output: [batch_size, seq_len, trans_count, nhead, head_dim]`
			`batch_size: the size of the current batch`
			`seq_len: the sequence length of the current batch`
			`hidden_dim: dim of the hidden tensor`
			`nhead: number of attention heads`
			`trans_count: 1 or 3, the count of matrice need to be transformed`
			`*/`
			`template <typename T>`
			`__global__ void transform4d_0213(T output, const T input, int batch_size,`
			`int seq_len, int trans_count, int nhead,`
			`int head_dim, int num_all) {`
			`int offset = blockIdx.x * blockDim.x + threadIdx.x;`
			`if (offset >= num_all) {`
			`return;`
			`}`
			`int trans_id, batch_id, head_id, token_id, dim_id;`
			`decompose_5dim(offset, batch_size, nhead, seq_len, head_dim, &trans_id,`
			`&batch_id, &head_id, &token_id, &dim_id);`
			`// [b, s, tc, nh, ad]`
			`int trg_offset = flat_5dim(batch_id, token_id, trans_id, head_id, dim_id,`
			`seq_len, trans_count, nhead, head_dim);`

			`const float4 input4 = reinterpret_cast<const float4 >(input);`
			`float4 res4 = reinterpret_cast<float4 >(output);`
			`res4[trg_offset] = input4[offset];`
			`}`

			`// [tc, b, nh, s, ad] -> [b, s, tc, nh, ad]`
			`template <>`
			`void launch_transform4d_0213<float>(float output, const float input,`
			`int batch_size, int seq_len, int hidden_dim,`
			`int nhead, int trans_count,`
			`cudaStream_t stream) {`
			`hidden_dim >>= 2;`
			`int head_dim = hidden_dim / nhead;`
			`int num_all = batch_size * seq_len * trans_count * hidden_dim;`
			`int nblock = (num_all + MAX_THREADS - 1) / MAX_THREADS;`

			`transform4d_0213<float><<<nblock, MAX_THREADS, 0, stream>>>(`
			`output, input, batch_size, seq_len, trans_count, nhead, head_dim,`
			`num_all);`
			`}`

			`template <>`
			`void launch_transform4d_0213<__half>(__half output, const __half input,`
			`int batch_size, int seq_len,`
			`int hidden_dim, int nhead, int trans_count,`
			`cudaStream_t stream) {`
			`hidden_dim >>= 3;`
			`int head_dim = hidden_dim / nhead;`
			`int num_all = batch_size * seq_len * trans_count * hidden_dim;`
			`int nblock = (num_all + MAX_THREADS - 1) / MAX_THREADS;`

			`transform4d_0213<__half><<<nblock, MAX_THREADS, 0, stream>>>(`
			`output, input, batch_size, seq_len, trans_count, nhead, head_dim,`
			`num_all);`
			`}`