diff --git a/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu b/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu
index 86579201b..64f0fc2c2 100644
--- a/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu
+++ b/colossalai/kernel/cuda_native/csrc/kernels/softmax_kernels.cu
@@ -120,7 +120,7 @@ __global__ void ker_attn_softmax(T *inp, const T *attn_mask, int from_len,
       BlockStore(ts_store).Store(inp + (token_id + i) * to_len, inp_val[i],
                                  to_len);
     }
-  } // blockIdx.x
+  }  // blockIdx.x
 }
 
 template <typename T, int block_dim, int ele_per_thread>
@@ -198,7 +198,7 @@ __global__ void ker_attn_softmax_lt32(T *inp, const T *attn_mask, int from_len,
       BlockStore(ts_store).Store(inp + (token_id + i) * to_len, inp_val[i],
                                  to_len);
     }
-  } // blockIdx.x
+  }  // blockIdx.x
 }
 
 /*
@@ -304,7 +304,8 @@ __global__ void ker_attn_softmax_bw(T *grad, const T *inp, int softmax_length) {
   cg::thread_block b = cg::this_thread_block();
   cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
-  for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
+  for (int i = 1; i < WARP_SIZE; i <<= 1)
+    sum += g.shfl_xor(sum, i);
 
 #pragma unroll
   for (int i = 0; i < ITERATIONS; ++i) {
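Note on the loop touched by the last hunk: summing g.shfl_xor(sum, i) over strides 1, 2, 4, ..., 16 is a butterfly (XOR) warp reduction, so after log2(WARP_SIZE) steps every lane of the 32-thread tile holds the full sum. Below is a minimal, self-contained CUDA sketch of that pattern for reference only; the kernel name warp_reduce_demo, the host driver, and the all-ones input are illustrative assumptions and are not part of this patch.

// Minimal sketch of the shfl_xor butterfly reduction used in ker_attn_softmax_bw.
// Assumption: this demo kernel and host code are illustrative, not ColossalAI code.
#include <cooperative_groups.h>
#include <cstdio>

namespace cg = cooperative_groups;

constexpr int WARP_SIZE = 32;

__global__ void warp_reduce_demo(const float *in, float *out) {
  cg::thread_block b = cg::this_thread_block();
  cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

  float sum = in[threadIdx.x];  // one partial value per lane

  // Butterfly reduction: partner lanes sit at XOR distance 1, 2, 4, 8, 16.
  // After the loop, every lane holds the sum of all 32 inputs.
  for (int i = 1; i < WARP_SIZE; i <<= 1)
    sum += g.shfl_xor(sum, i);

  if (g.thread_rank() == 0) out[0] = sum;
}

int main() {
  float h_in[WARP_SIZE], h_out = 0.f;
  for (int i = 0; i < WARP_SIZE; ++i) h_in[i] = 1.f;  // expected warp sum: 32

  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  warp_reduce_demo<<<1, WARP_SIZE>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %f\n", h_out);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}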