ColossalAI/colossalai/kernel/cuda_native/csrc/kernels/dropout_kernels.cu

1002 lines
37 KiB
Plaintext

#include <cooperative_groups.h>
#include <chrono>
#include <ctime>
#include "kernels.h"
namespace cg = cooperative_groups;
curandStatePhilox4_32_10_t *curandstate;
/**
* @brief element-wise activation function on device, like Relu, Gelu
*
* @tparam enum class ActivationType, kRelu, kGelu
* @tparam input type
* @param any shape of float and __half2
* @return same shape and type with input
*/
template <ActivationType, typename T>
__forceinline__ __device__ T activation_kernel(T x);
template <>
__device__ float activation_kernel<ActivationType::kGelu, float>(float x) {
float cdf =
0.5f *
(1.0f + tanhf((0.7978845608028654f * (x + 0.044715f * x * x * x))));
return x * cdf;
}
template <>
__device__ __half2
activation_kernel<ActivationType::kGelu, __half2>(__half2 val) {
__half2 val_pow3 = __hmul2(val, __hmul2(val, val));
float2 tmp_pow = __half22float2(val_pow3);
float2 tmp = __half22float2(val);
tmp.x =
0.5f *
(1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
tmp.y =
0.5f *
(1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
return __hmul2(val, __float22half2_rn(tmp));
}
template <>
__device__ float activation_kernel<ActivationType::kRelu, float>(float x) {
return fmaxf(x, 0);
}
template <>
__device__ __half2
activation_kernel<ActivationType::kRelu, __half2>(__half2 x) {
return __floats2half2_rn(fmaxf(0.f, __half2float(x.x)),
fmaxf(0.f, __half2float(x.y)));
}
/**
* @brief element-wise activation backward function on device
*
* @tparam enum class ActivationType
* @tparam input type
* @param any shape of float and __half2
* @return same shape of input
*/
template <ActivationType, typename T>
__forceinline__ __device__ T activation_bwd_kernel(T grad, T x);
template <>
__device__ float activation_bwd_kernel<ActivationType::kGelu, float>(float grad,
float x) {
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
float x2mul = x * x * mul_param;
float tan_h = tanhf(sqrt_param * (x + x * x2mul));
float dg1 = 0.5f * (1.0f + tan_h);
float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
float dg3 = dg2 * 3 * x2mul;
return grad * (dg1 + dg2 + dg3);
}
template <>
__device__ __half activation_bwd_kernel<ActivationType::kGelu, __half>(
__half grad, __half x_half) {
float x = __half2float(x_half);
const float sqrt_param = 0.79788456080286535587989211986876f;
const float mul_param = 0.044715;
float x2mul = x * x * mul_param;
float tan_h = tanhf(sqrt_param * (x + x * x2mul));
float dg1 = 0.5f * (1.0f + tan_h);
float dg2 = x * 0.5f * sqrt_param * (1 - tan_h * tan_h);
float dg3 = dg2 * 3 * x2mul;
return grad * __float2half(dg1 + dg2 + dg3);
}
template <>
__device__ float activation_bwd_kernel<ActivationType::kRelu, float>(float grad,
float x) {
return x > 0.f ? grad : 0.f;
}
template <>
__device__ __half
activation_bwd_kernel<ActivationType::kRelu, __half>(__half grad, __half x) {
const __half half_zero = __float2half(0.f);
return x > half_zero ? grad : half_zero;
}
template <>
__device__ __half2 activation_bwd_kernel<ActivationType::kRelu, __half2>(
__half2 grad2, __half2 x_half2) {
const __half half_zero = __float2half(0.f);
return __floats2half2_rn(x_half2.x > half_zero ? grad2.x : half_zero,
x_half2.y > half_zero ? grad2.y : half_zero);
}
/**
* @brief init curand states in global memory
*
* @thread grid_dim * block*dim to suuport any size of states
* @param state persistant curand states
* @param seed seed to init states
* @return void
*/
__global__ void curand_init_kernel(curandStatePhilox4_32_10_t *state,
int seed) {
/* Each thread gets same seed, a different sequence
number, no offset */
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, id, 0, &state[id]);
}
void launch_curand_init(int total_count, int dim, cudaStream_t stream) {
cudaMalloc(&curandstate, total_count * sizeof(curandStatePhilox4_32_10_t));
int grid_dim = total_count >> 9;
curand_init_kernel<<<grid_dim, 512, 0, stream>>>(
curandstate, std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count());
}
/**
* @brief element-wise dropout, store dropped position in mask, it's not
* in-place
*
* @thread
* gridDim.x = total_count / 1024
* blockDim.x = 1024
*
* @param total_count total elements
* @param ratio drop ratio
* @param out any size of float and __half
* @param in same with out
* @param mask uint8 type, same size with out
* @param seed seed to curand
* @return void
*/
__global__ void ls_dropout_kernel(const int total_count, const float ratio,
float *__restrict__ out,
const float *__restrict__ in,
uint8_t *__restrict__ mask, const int seed) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 4 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
uint8_t m[4];
float4 *out4 = reinterpret_cast<float4 *>(out);
const float4 *data4 = reinterpret_cast<const float4 *>(in);
uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
float4 rand = curand_uniform4(&state);
m[0] = (uint8_t)(rand.x > ratio);
m[1] = (uint8_t)(rand.y > ratio);
m[2] = (uint8_t)(rand.z > ratio);
m[3] = (uint8_t)(rand.w > ratio);
uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
mask4[i] = m4[0];
float4 input4 = data4[i];
float4 res4;
res4.x = input4.x * scale * m[0];
res4.y = input4.y * scale * m[1];
res4.z = input4.z * scale * m[2];
res4.w = input4.w * scale * m[3];
out4[i] = res4;
}
__global__ void ls_dropout_kernel(const int total_count, const float ratio,
__half *__restrict__ out,
const __half *__restrict__ in,
uint8_t *__restrict__ mask, const int seed) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 8 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
float4 *outs_float4 = reinterpret_cast<float4 *>(out);
uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);
uint8_t m[8];
float4 rand = curand_uniform4(&state);
m[0] = (uint8_t)(rand.x > ratio);
m[1] = (uint8_t)(rand.y > ratio);
m[2] = (uint8_t)(rand.z > ratio);
m[3] = (uint8_t)(rand.w > ratio);
rand = curand_uniform4(&state);
m[4] = (uint8_t)(rand.x > ratio);
m[5] = (uint8_t)(rand.y > ratio);
m[6] = (uint8_t)(rand.z > ratio);
m[7] = (uint8_t)(rand.w > ratio);
uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
mask8[i] = *m8;
float4 val_float4 = vals_float4[i];
float4 out_float4;
__half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
__half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
__half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
__half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
__half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
__half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
outs_float4[i] = out_float4;
}
/**
* @brief element-wise dropout backward with dropout mask, it's
* not in-place
*
* @thread
* gridDim.x = total_count / 1024
* blockDim.x = 1024
*
* @param total_count total elements
* @param ratio drop ratio
* @param in any size of float and __half
* @param mask uint8 type, same size with in
* @return void
*/
__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
float *out, const float *in,
const uint8_t *__restrict__ mask) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 4 >= total_count) return;
uint8_t m[4];
float4 *out4 = reinterpret_cast<float4 *>(out);
const float4 *in4 = reinterpret_cast<const float4 *>(in);
const uint32_t *mask4 = reinterpret_cast<const uint32_t *>(mask);
uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
m4[0] = mask4[i];
float4 input4 = in4[i];
float4 res4;
res4.x = input4.x * scale * static_cast<float>(m[0]);
res4.y = input4.y * scale * static_cast<float>(m[1]);
res4.z = input4.z * scale * static_cast<float>(m[2]);
res4.w = input4.w * scale * static_cast<float>(m[3]);
out4[i] = res4;
}
__global__ void ls_dropout_bwd_kernel(const int total_count, const float ratio,
__half *out, const __half *in,
const uint8_t *__restrict__ mask) {
const __half scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 8 >= total_count) return;
float4 *out4 = reinterpret_cast<float4 *>(out);
const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
const uint64_t *mask8 = reinterpret_cast<const uint64_t *>(mask);
uint8_t m[8];
uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
m8[0] = mask8[i];
float4 val_float4 = vals_float4[i];
float4 out_float4;
__half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
__half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
__half2 scale_mask_1 =
__halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
__half2 scale_mask_2 =
__halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
__half2 scale_mask_3 =
__halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
__half2 scale_mask_4 =
__halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
out_half2[0] = __hmul2(val_half2[0], scale_mask_1);
out_half2[1] = __hmul2(val_half2[1], scale_mask_2);
out_half2[2] = __hmul2(val_half2[2], scale_mask_3);
out_half2[3] = __hmul2(val_half2[3], scale_mask_4);
out4[i] = out_float4;
}
template <>
void launch_ls_dropout<float>(float *out, const float *vals, uint8_t *mask,
int total_count, float ratio, cudaStream_t stream,
bool backward) {
int grid_dim = total_count >> 12;
if (!backward) {
ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
total_count, ratio, out, vals, mask,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count());
} else {
ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count, ratio,
out, vals, mask);
}
}
template <>
void launch_ls_dropout<__half>(__half *out, const __half *vals, uint8_t *mask,
int total_count, float ratio,
cudaStream_t stream, bool backward) {
int grid_dim = total_count >> 13;
if (!backward) {
ls_dropout_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
total_count, ratio, out, vals, mask,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count());
} else {
ls_dropout_bwd_kernel<<<grid_dim + 1, 1024, 0, stream>>>(total_count, ratio,
out, vals, mask);
}
}
/**
* @brief fused bias, dropout, and residual at the end of Attention and FFN,
* store dropped position in mask, it's not in-place
*
* @thread
* gridDim.x = total_count / 1024
* blockDim.x = 1024
*
* @param total_count total elements
* @param ratio drop ratio
* @param out [batch_size, seq_len, hidden_size], float and __half
* @param in [batch_size, seq_len, hidden_size], float and __half
* @param mask [batch_size, seq_len, hidden_size], uint8 type
* @param bias [hidden_size], ffn bias
* @param residual [batch_size, seq_len, hidden_size], float and __half
* @param seed seed to curand
* @param hidden_size hidden size
* @return void
*/
__global__ void ls_dropout_res_bias_kernel(
const int total_count, const float ratio, float *__restrict__ out,
const float *__restrict__ in, uint8_t *__restrict__ mask,
const float *__restrict__ bias, const float *__restrict__ residual,
const int seed, const int hidden_size) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 4 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
uint8_t m[4];
float4 *out4 = reinterpret_cast<float4 *>(out);
const float4 *data4 = reinterpret_cast<const float4 *>(in);
const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
float4 rand = curand_uniform4(&state);
m[0] = static_cast<uint8_t>(rand.x > ratio);
m[1] = static_cast<uint8_t>(rand.y > ratio);
m[2] = static_cast<uint8_t>(rand.z > ratio);
m[3] = static_cast<uint8_t>(rand.w > ratio);
int bias_i = i % (hidden_size >> 2);
uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
mask4[i] = m4[0];
const float4 input4 = data4[i];
const float4 b4 = __ldg(&bias4[bias_i]);
const float4 res4 = residual4[i];
float4 output4;
output4.x = (input4.x + b4.x) * scale * m[0] + res4.x;
output4.y = (input4.y + b4.y) * scale * m[1] + res4.y;
output4.z = (input4.z + b4.z) * scale * m[2] + res4.z;
output4.w = (input4.w + b4.w) * scale * m[3] + res4.w;
out4[i] = output4;
}
__global__ void ls_dropout_res_bias_kernel(
const int total_count, const float ratio, __half *__restrict__ out,
const __half *__restrict__ in, uint8_t *__restrict__ mask,
const __half *__restrict__ bias, const __half *__restrict__ residual,
const int seed, const int hidden_size) {
const __half scale = 1. / (1. - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 8 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
float4 *outs_float4 = reinterpret_cast<float4 *>(out);
const float4 *residual4 = reinterpret_cast<const float4 *>(residual);
const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);
uint8_t m[8];
float4 rand = curand_uniform4(&state);
m[0] = static_cast<uint8_t>(rand.x > ratio);
m[1] = static_cast<uint8_t>(rand.y > ratio);
m[2] = static_cast<uint8_t>(rand.z > ratio);
m[3] = static_cast<uint8_t>(rand.w > ratio);
rand = curand_uniform4(&state);
m[4] = static_cast<uint8_t>(rand.x > ratio);
m[5] = static_cast<uint8_t>(rand.y > ratio);
m[6] = static_cast<uint8_t>(rand.z > ratio);
m[7] = static_cast<uint8_t>(rand.w > ratio);
uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
mask8[i] = m8[0];
int bias_i = i % (hidden_size >> 3);
float4 val_float4 = vals_float4[i];
const float4 b4 = __ldg(&bias4[bias_i]);
const float4 res4 = residual4[i];
float4 out_float4;
__half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
__half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
const __half2 *res_half2 = reinterpret_cast<const __half2 *>(&res4);
__half2 scale_mask_1 =
__halves2half2(scale * __float2half(m[0]), scale * __float2half(m[1]));
__half2 scale_mask_2 =
__halves2half2(scale * __float2half(m[2]), scale * __float2half(m[3]));
__half2 scale_mask_3 =
__halves2half2(scale * __float2half(m[4]), scale * __float2half(m[5]));
__half2 scale_mask_4 =
__halves2half2(scale * __float2half(m[6]), scale * __float2half(m[7]));
out_half2[0] =
__hfma2(__hadd2(val_half2[0], b_half2[0]), scale_mask_1, res_half2[0]);
out_half2[1] =
__hfma2(__hadd2(val_half2[1], b_half2[1]), scale_mask_2, res_half2[1]);
out_half2[2] =
__hfma2(__hadd2(val_half2[2], b_half2[2]), scale_mask_3, res_half2[2]);
out_half2[3] =
__hfma2(__hadd2(val_half2[3], b_half2[3]), scale_mask_4, res_half2[3]);
outs_float4[i] = out_float4;
}
template <>
void launch_ls_dropout_res_bias<float>(float *out, const float *vals,
uint8_t *mask, const float *bias,
const float *residual, int total_count,
int dim, float ratio,
cudaStream_t stream) {
int grid_dim = total_count >> 12;
ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
total_count, ratio, out, vals, mask, bias, residual,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
template <>
void launch_ls_dropout_res_bias<__half>(__half *out, const __half *vals,
uint8_t *mask, const __half *bias,
const __half *residual, int total_count,
int dim, float ratio,
cudaStream_t stream) {
int grid_dim = total_count >> 13;
ls_dropout_res_bias_kernel<<<grid_dim + 1, 1024, 0, stream>>>(
total_count, ratio, out, vals, mask, bias, residual,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
/**
* @brief fused bias and dropout backward at the end of Attention and FFN
*
* @thread
* gridDim.x = hidden_size / 8
* blockDim.x = 8
* blockDim.y = 1024 / 8 = 128
*
* @param row_size batch_size * seq_len
* @param ratio dropout ratio
* @param in_grad [batch_size, seq_len, hidden_size], input grad
* @param bias_grad [hidden_size], bias grad
* @param out_grad [batch_size, seq_len, hidden_size], output grad
* @param mask [batch_size, seq_len, hidden_size], dropout mask
* @param hidden_size
* @return void
*/
__global__ void ls_dropout_bias_bwd_kernel(
const int row_size, const float ratio, float *__restrict__ in_grad,
float *__restrict__ bias_grad, const float *__restrict__ out_grad,
const uint8_t *__restrict__ mask, const int hidden_size) {
const float scale = 1.f / (1.f - ratio);
// every block generate 8 bias result
__shared__ float tile[8][129];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
int stride = hidden_size * 128;
float local_sum = 0;
int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
for (int r = threadIdx.y; r < row_size; r += 128) {
float val = out_grad[idx];
val *= scale * static_cast<float>(mask[idx]);
local_sum += val;
in_grad[idx] = val;
idx += stride;
}
tile[threadIdx.x][threadIdx.y] = local_sum;
__syncthreads();
float sum = 0;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int x = tid >> 7;
int y = tid & (127);
if (y < 32) {
#pragma unroll
for (int i = 0; i < 4; i++) {
sum += tile[x][y + i * 32];
}
}
__syncthreads();
for (int i = 1; i < 32; i <<= 1) sum += g.shfl_down(sum, i);
if (y == 0) tile[0][x] = sum;
__syncthreads();
if (threadIdx.x < 8) {
int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
bias_grad[pos] = tile[0][threadIdx.x];
}
}
__global__ void ls_dropout_bias_bwd_kernel(
const int row_size, const float ratio, __half *__restrict__ in_grad,
__half *__restrict__ bias_grad, const __half *__restrict__ out_grad,
const uint8_t *__restrict__ mask, const int hidden_size) {
const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
__shared__ __half2 tile[8][129];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
__half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
__half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
int col_idx = flat_2dim(blockIdx.x, threadIdx.x, 8);
int stride = hidden_size * 128;
__half2 local_sum = __float2half2_rn(0.f);
int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
for (int r = threadIdx.y; r < row_size; r += 128) {
__half2 val = out_grad2[idx];
__half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
val *= scale * m2;
local_sum += val;
in_grad2[idx] = val;
idx += stride;
}
tile[threadIdx.x][threadIdx.y] = local_sum;
__syncthreads();
__half2 sum = __float2half2_rn(0.f);
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int x = tid >> 7;
int y = tid & (127);
if (y < 32) {
#pragma unroll
for (int i = 0; i < 4; i++) {
sum += tile[x][y + i * 32];
}
}
__syncthreads();
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
if (y == 0) tile[0][x] = sum;
__syncthreads();
if (threadIdx.x < 8) {
int pos = flat_2dim(blockIdx.x, threadIdx.x, 8);
bias_grad2[pos] = tile[0][threadIdx.x];
}
}
template <typename T>
void launch_ls_dropout_bias_bwd(T *in_grad, T *bias_grad, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream) {
dim3 grid_dim((dim - 1) / 8 + 1);
dim3 block_dim(8, 128);
ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}
template <>
void launch_ls_dropout_bias_bwd(__half *in_grad, __half *bias_grad,
const __half *out_grad, const uint8_t *mask,
int row_size, int dim, float ratio,
cudaStream_t stream) {
dim >>= 1;
dim3 grid_dim((dim - 1) / 8 + 1);
dim3 block_dim(8, 128);
ls_dropout_bias_bwd_kernel<<<grid_dim, block_dim, 0, stream>>>(
row_size, ratio, in_grad, bias_grad, out_grad, mask, dim);
}
template void launch_ls_dropout_bias_bwd(float *in_grad, float *bias_grad,
const float *out_grad,
const uint8_t *mask, int row_size,
int dim, float ratio,
cudaStream_t stream);
/**
* @brief fused bias, activation, and dropout at the end of first ffn
*
* @thread
* gridDim.x = hidden_size / 8
* blockDim.x = 8
* blockDim.y = 1024 / 8 = 128
*
* @tparam act_type activation function, like kRelu, kGelu
* @param total_count total elements
* @param ratio drop ratio
* @param out [batch_size, seq_len, hidden_size], float and __half
* @param in [batch_size, seq_len, hidden_size], float and __half
* @param mask [batch_size, seq_len, hidden_size], uint8 type
* @param bias [hidden_size], ffn bias
* @param seed seed to curand
* @param hidden_size
* @return void
*/
template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
const int total_count, const float ratio, float *__restrict__ out,
const float *__restrict__ in, uint8_t *__restrict__ mask,
const float *__restrict__ bias, const int seed, const int hidden_size) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 4 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
uint8_t m[4];
float4 *out4 = reinterpret_cast<float4 *>(out);
const float4 *data4 = reinterpret_cast<const float4 *>(in);
const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
uint32_t *mask4 = reinterpret_cast<uint32_t *>(mask);
float4 rand = curand_uniform4(&state);
m[0] = (uint8_t)(rand.x > ratio);
m[1] = (uint8_t)(rand.y > ratio);
m[2] = (uint8_t)(rand.z > ratio);
m[3] = (uint8_t)(rand.w > ratio);
int bias_i = i % (hidden_size >> 2);
uint32_t *m4 = reinterpret_cast<uint32_t *>(m);
mask4[i] = m4[0];
const float4 input4 = data4[i];
const float4 b4 = __ldg(&bias4[bias_i]);
float4 output4;
output4.x =
activation_kernel<act_type, float>(input4.x + b4.x) * scale * m[0];
output4.y =
activation_kernel<act_type, float>(input4.y + b4.y) * scale * m[1];
output4.z =
activation_kernel<act_type, float>(input4.z + b4.z) * scale * m[2];
output4.w =
activation_kernel<act_type, float>(input4.w + b4.w) * scale * m[3];
out4[i] = output4;
}
template <ActivationType act_type>
__global__ void ls_dropout_act_bias_kernel(
const int total_count, const float ratio, __half *__restrict__ out,
const __half *__restrict__ in, uint8_t *__restrict__ mask,
const __half *__restrict__ bias, const int seed, const int hidden_size) {
const float scale = 1.f / (1.f - ratio);
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i * 8 >= total_count) return;
curandStatePhilox4_32_10_t state;
curand_init(seed, i, 0, &state);
const float4 *vals_float4 = reinterpret_cast<const float4 *>(in);
float4 *outs_float4 = reinterpret_cast<float4 *>(out);
const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
uint64_t *mask8 = reinterpret_cast<uint64_t *>(mask);
uint8_t m[8];
float4 rand = curand_uniform4(&state);
m[0] = (uint8_t)(rand.x > ratio);
m[1] = (uint8_t)(rand.y > ratio);
m[2] = (uint8_t)(rand.z > ratio);
m[3] = (uint8_t)(rand.w > ratio);
rand = curand_uniform4(&state);
m[4] = (uint8_t)(rand.x > ratio);
m[5] = (uint8_t)(rand.y > ratio);
m[6] = (uint8_t)(rand.z > ratio);
m[7] = (uint8_t)(rand.w > ratio);
uint64_t *m8 = reinterpret_cast<uint64_t *>(m);
mask8[i] = *m8;
int bias_i = i % (hidden_size >> 3);
float4 val_float4 = vals_float4[i];
const float4 b4 = __ldg(&bias4[bias_i]);
float4 out_float4;
__half2 *val_half2 = reinterpret_cast<__half2 *>(&val_float4);
__half2 *out_half2 = reinterpret_cast<__half2 *>(&out_float4);
const __half2 *b_half2 = reinterpret_cast<const __half2 *>(&b4);
__half2 scale_mask_1 = __floats2half2_rn(scale * m[0], scale * m[1]);
__half2 scale_mask_2 = __floats2half2_rn(scale * m[2], scale * m[3]);
__half2 scale_mask_3 = __floats2half2_rn(scale * m[4], scale * m[5]);
__half2 scale_mask_4 = __floats2half2_rn(scale * m[6], scale * m[7]);
out_half2[0] = __hmul2(
activation_kernel<act_type, __half2>(__hadd2(val_half2[0], b_half2[0])),
scale_mask_1);
out_half2[1] = __hmul2(
activation_kernel<act_type, __half2>(__hadd2(val_half2[1], b_half2[1])),
scale_mask_2);
out_half2[2] = __hmul2(
activation_kernel<act_type, __half2>(__hadd2(val_half2[2], b_half2[2])),
scale_mask_3);
out_half2[3] = __hmul2(
activation_kernel<act_type, __half2>(__hadd2(val_half2[3], b_half2[3])),
scale_mask_4);
outs_float4[i] = out_float4;
}
template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, float>(
float *out, const float *vals, uint8_t *mask, const float *bias,
int total_count, int dim, float ratio, cudaStream_t stream) {
int grid_dim = total_count >> 10;
ls_dropout_act_bias_kernel<ActivationType::kGelu>
<<<grid_dim + 1, 256, 0, stream>>>(
total_count, ratio, out, vals, mask, bias,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
template <>
void launch_ls_dropout_act_bias<ActivationType::kGelu, __half>(
__half *out, const __half *vals, uint8_t *mask, const __half *bias,
int total_count, int dim, float ratio, cudaStream_t stream) {
int grid_dim = total_count >> 11;
ls_dropout_act_bias_kernel<ActivationType::kGelu>
<<<grid_dim + 1, 256, 0, stream>>>(
total_count, ratio, out, vals, mask, bias,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, float>(
float *out, const float *vals, uint8_t *mask, const float *bias,
int total_count, int dim, float ratio, cudaStream_t stream) {
int grid_dim = total_count >> 10;
ls_dropout_act_bias_kernel<ActivationType::kRelu>
<<<grid_dim + 1, 256, 0, stream>>>(
total_count, ratio, out, vals, mask, bias,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
template <>
void launch_ls_dropout_act_bias<ActivationType::kRelu, __half>(
__half *out, const __half *vals, uint8_t *mask, const __half *bias,
int total_count, int dim, float ratio, cudaStream_t stream) {
int grid_dim = total_count >> 11;
ls_dropout_act_bias_kernel<ActivationType::kRelu>
<<<grid_dim + 1, 256, 0, stream>>>(
total_count, ratio, out, vals, mask, bias,
std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count(),
dim);
}
/**
* @brief fused bias, activation, and dropout backward
*
* @thread
* gridDim.x = total_count / 1024
* blockDim.x = 1024
*
* @tparam act_type kRelu
* @param row_size batch_size * seq_len
* @param ratio dropout ratio
* @param in_grad [batch_size, seq_len, hidden_size], input grad
* @param bias_grad [hidden_size], bias grad
* @param out_grad [batch_size, seq_len, hidden_size], output grad
* @param mask [batch_size, seq_len, hidden_size], dropout mask
* @param hidden_size
* @return void
*/
template <ActivationType act_type, typename T>
__global__ void ls_dropout_act_bias_bwd_kernel(
const int row_size, const float ratio, T *in_grad,
T *__restrict__ bias_grad, const T *__restrict__ input,
const T *__restrict__ bias, const T *out_grad,
const uint8_t *__restrict__ mask, const int hidden_size) {
const float scale = 1.f / (1.f - ratio);
__shared__ float tile[WARP_SIZE][WARP_SIZE + 1];
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
int stride = hidden_size * WARP_SIZE;
float local_sum = 0;
int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
if (col_idx < hidden_size) {
for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
float val = out_grad[idx];
float in = input[idx];
float b = bias[idx % hidden_size];
val = activation_bwd_kernel<act_type, float>(
val * scale * static_cast<float>(mask[idx]), in + b);
local_sum += val;
in_grad[idx] = val;
idx += stride;
}
}
tile[threadIdx.x][threadIdx.y] = local_sum;
__syncthreads();
float sum = tile[threadIdx.y][threadIdx.x];
__syncthreads();
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
__syncthreads();
if (threadIdx.y == 0) {
int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
bias_grad[pos] = tile[0][threadIdx.x];
}
}
// @brief fused bias, activation, and dropout backward
// It is deprecated for precision reason. Keep it for future optimization.
//
// template <ActivationType act_type>
// __global__ void ls_dropout_act_bias_bwd_kernel(
// const int row_size, const float ratio, __half * in_grad,
// __half *__restrict__ bias_grad, const __half *__restrict__ input, const
// __half *__restrict__ bias, const __half * out_grad, const uint8_t
// *__restrict__ mask, const int hidden_size) {
// const __half2 scale = __float2half2_rn(1.f / (1.f - ratio));
// __shared__ __half2 tile[WARP_SIZE][WARP_SIZE + 1];
// cg::thread_block b = cg::this_thread_block();
// cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
// __half2 *in_grad2 = reinterpret_cast<__half2 *>(in_grad);
// __half2 *bias_grad2 = reinterpret_cast<__half2 *>(bias_grad);
// const __half2 *out_grad2 = reinterpret_cast<const __half2 *>(out_grad);
// const __half2 *input2 = reinterpret_cast<const __half2 *>(input);
// const __half2 *bias2 = reinterpret_cast<const __half2 *>(bias);
// int col_idx = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// int stride = hidden_size * WARP_SIZE;
// __half2 local_sum = __float2half2_rn(0.f);
// int idx = flat_2dim(threadIdx.y, col_idx, hidden_size);
// if (col_idx < hidden_size) {
// for (int r = threadIdx.y; r < row_size; r += WARP_SIZE) {
// __half2 val = out_grad2[idx];
// __half2 in2 = input2[idx];
// __half2 b2 = bias2[idx % hidden_size ];
// __half2 m2 = __floats2half2_rn(mask[2 * idx], mask[2 * idx + 1]);
// val = activation_bwd_kernel<ActivationType::kRelu, __half2>(val * scale
// *
// m2,
// in2+b2);
// local_sum += val;
// in_grad2[idx] = val;
// idx += stride;
// }
// }
// tile[threadIdx.x][threadIdx.y] = local_sum;
// __syncthreads();
// __half2 sum = tile[threadIdx.y][threadIdx.x];
// __syncthreads();
// for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_down(sum, i);
// if (threadIdx.x == 0) tile[0][threadIdx.y] = sum;
// __syncthreads();
// if (threadIdx.y == 0) {
// int pos = flat_2dim(blockIdx.x, threadIdx.x, WARP_SIZE);
// bias_grad2[pos] = tile[0][threadIdx.x];
// }
// }
template <ActivationType act_type, typename T>
void launch_ls_dropout_act_bias_bwd(T *in_grad, T *bias_grad, const T *input,
const T *bias, const T *out_grad,
const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream) {
dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
dim3 block_dim(WARP_SIZE, WARP_SIZE);
ls_dropout_act_bias_bwd_kernel<act_type><<<grid_dim, block_dim, 0, stream>>>(
row_size, ratio, in_grad, bias_grad, input, bias, out_grad, mask, dim);
}
// template <>
// void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
// __half *in_grad, __half *bias_grad,const __half *input, const __half
// *bias, const __half *out_grad, const uint8_t *mask, int row_size, int
// dim, float ratio, cudaStream_t stream) {
// dim >>= 1;
// dim3 grid_dim((dim - 1) / WARP_SIZE + 1);
// dim3 block_dim(WARP_SIZE, WARP_SIZE);
// ls_dropout_act_bias_bwd_kernel<ActivationType::kRelu>
// <<<grid_dim, block_dim, 0, stream>>>(row_size, ratio, in_grad,
// bias_grad,
// input, bias,out_grad, mask, dim);
// }
template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, float>(
float *in_grad, float *bias_grad, const float *input, const float *bias,
const float *out_grad, const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template void launch_ls_dropout_act_bias_bwd<ActivationType::kRelu, __half>(
__half *in_grad, __half *bias_grad, const __half *input, const __half *bias,
const __half *out_grad, const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, float>(
float *in_grad, float *bias_grad, const float *input, const float *bias,
const float *out_grad, const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);
template void launch_ls_dropout_act_bias_bwd<ActivationType::kGelu, __half>(
__half *in_grad, __half *bias_grad, const __half *input, const __half *bias,
const __half *out_grad, const uint8_t *mask, int row_size, int dim,
float ratio, cudaStream_t stream);