diff --git a/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu b/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu index 64f0fc2c2..98af433fe 100644 --- a/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu +++ b/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu @@ -1,3 +1,4 @@ +#include #include #include @@ -6,8 +7,6 @@ #include "block_reduce.h" #include "kernels.h" -#include - namespace cg = cooperative_groups; const float EPSILON = 1e-8f; @@ -120,7 +119,7 @@ __global__ void ker_attn_softmax(T *inp, const T *attn_mask, int from_len, BlockStore(ts_store).Store(inp + (token_id + i) * to_len, inp_val[i], to_len); } - } // blockIdx.x + } // blockIdx.x } template @@ -198,7 +197,7 @@ __global__ void ker_attn_softmax_lt32(T *inp, const T *attn_mask, int from_len, BlockStore(ts_store).Store(inp + (token_id + i) * to_len, inp_val[i], to_len); } - } // blockIdx.x + } // blockIdx.x } /* @@ -304,8 +303,7 @@ __global__ void ker_attn_softmax_bw(T *grad, const T *inp, int softmax_length) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile g = cg::tiled_partition(b); - for (int i = 1; i < WARP_SIZE; i <<= 1) - sum += g.shfl_xor(sum, i); + for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i); #pragma unroll for (int i = 0; i < ITERATIONS; ++i) {