[NFC] polish colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu code style (#977)

pull/997/head
Jie Zhu 2022-05-16 12:33:06 +08:00 committed by binmakeswell
parent 52705ec5c5
commit b67eebd20f
1 changed file with 7 additions and 7 deletions

colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu

@@ -15,7 +15,8 @@
 #define BLOCK_SIZE 512
 #define ILP 4
-template <typename T> __device__ __forceinline__ bool is_aligned(T *p) {
+template <typename T>
+__device__ __forceinline__ bool is_aligned(T *p) {
   return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
 }
@@ -27,7 +28,8 @@ __device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
   ((LT *)dst)[dst_offset] = ((LT *)src)[src_offset];
 }
-template <typename in_t, typename out_t> struct ScaleFunctor {
+template <typename in_t, typename out_t>
+struct ScaleFunctor {
   __device__ __forceinline__ void operator()(int chunk_size,
                                              volatile int *noop_gmem,
                                              TensorListMetadata<2> &tl,
@@ -76,8 +78,7 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
         for (int ii = 0; ii < ILP; ii++) {
           r_in[ii] = 0;
           int i = i_start + threadIdx.x + ii * blockDim.x;
-          if (i < n && i < chunk_size)
-            r_in[ii] = in[i];
+          if (i < n && i < chunk_size) r_in[ii] = in[i];
         }
         // note for clarification to future michael:
         // From a pure memory dependency perspective, there's likely no point
@@ -93,14 +94,13 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
 #pragma unroll
         for (int ii = 0; ii < ILP; ii++) {
           int i = i_start + threadIdx.x + ii * blockDim.x;
-          if (i < n && i < chunk_size)
-            out[i] = r_out[ii];
+          if (i < n && i < chunk_size) out[i] = r_out[ii];
         }
       }
     }
     if (!finite)
       *noop_gmem =
           1; // Blindly fire off a write. These will race but that's ok.
   }
 };
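
For context, the hunks above only reflow declarations and single-statement ifs; behavior is unchanged. The sketch below is a minimal, self-contained illustration of the ILP-wide copy pattern that is_aligned and load_store support in this kernel: when both pointers are aligned to an ILP-element boundary, a thread moves ILP elements with one wide memory access, otherwise it falls back to scalar accesses. The names VecT, is_aligned_sketch, and copy_ilp_sketch are hypothetical and not part of this patch or of ColossalAI.

// Illustrative sketch only; assumes ILP elements per thread and a power-of-two
// element size so the alignas value is valid.
#include <cstdint>

#define ILP 4

// Wide value type: one load/store of VecT<T> moves ILP elements of T at once.
template <typename T>
struct alignas(ILP * sizeof(T)) VecT {
  T val[ILP];
};

template <typename T>
__device__ __forceinline__ bool is_aligned_sketch(const T *p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

template <typename T>
__global__ void copy_ilp_sketch(T *dst, const T *src, int n) {
  int base = (blockIdx.x * blockDim.x + threadIdx.x) * ILP;
  if (base + ILP <= n && is_aligned_sketch(dst + base) &&
      is_aligned_sketch(src + base)) {
    // Fast path: one wide transaction per thread covers ILP elements.
    *reinterpret_cast<VecT<T> *>(dst + base) =
        *reinterpret_cast<const VecT<T> *>(src + base);
  } else {
    // Scalar fallback for the tail or for misaligned pointers.
    for (int ii = 0; ii < ILP; ii++) {
      int i = base + ii;
      if (i < n) dst[i] = src[i];
    }
  }
}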