mirror of https://github.com/hpcaitech/ColossalAI
[NFC] polish colossalai/kernel/cuda_native/csrc/multi_tensor_scale_kernel.cu code style (#977)
parent 52705ec5c5
commit b67eebd20f
@@ -15,7 +15,8 @@
 #define BLOCK_SIZE 512
 #define ILP 4
 
-template <typename T> __device__ __forceinline__ bool is_aligned(T *p) {
+template <typename T>
+__device__ __forceinline__ bool is_aligned(T *p) {
   return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
 }
 
@@ -27,7 +28,8 @@ __device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
   ((LT *)dst)[dst_offset] = ((LT *)src)[src_offset];
 }
 
-template <typename in_t, typename out_t> struct ScaleFunctor {
+template <typename in_t, typename out_t>
+struct ScaleFunctor {
   __device__ __forceinline__ void operator()(int chunk_size,
                                              volatile int *noop_gmem,
                                              TensorListMetadata<2> &tl,
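Not part of the commit, but for readers of the diff: the context lines above show the two helpers behind the kernel's vectorized fast path. is_aligned checks that a pointer sits on an ILP * sizeof(T) boundary, and load_store reinterprets both pointers as a wider type LT so a single instruction moves ILP elements at once. Below is a minimal, self-contained sketch of that pattern; the aligned_storage-based definition of LT and the copy_chunk kernel are illustrative assumptions, since the diff only shows parts of the helpers.

#include <cstdint>
#include <type_traits>

#define ILP 4

// A pointer can be accessed ILP elements at a time only when it is
// aligned to ILP * sizeof(T) bytes (same check as in the diff above).
template <typename T>
__device__ __forceinline__ bool is_aligned(T *p) {
  return ((uint64_t)p) % (ILP * sizeof(T)) == 0;
}

// Move ILP elements of T with one wide access by viewing the memory as an
// opaque ILP * sizeof(T)-byte block. This LT definition is an assumption;
// the diff only shows the ((LT *)dst)[...] = ((LT *)src)[...] body.
template <typename T>
__device__ __forceinline__ void load_store(T *dst, T *src, int dst_offset,
                                           int src_offset) {
  typedef
      typename std::aligned_storage<ILP * sizeof(T), ILP * alignof(T)>::type
          LT;
  ((LT *)dst)[dst_offset] = ((LT *)src)[src_offset];
}

// Hypothetical kernel illustrating the fast/slow path split: use the wide
// copy when both pointers pass the alignment check and n divides evenly,
// otherwise fall back to guarded scalar accesses.
template <typename T>
__global__ void copy_chunk(T *dst, T *src, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (is_aligned(dst) && is_aligned(src) && n % ILP == 0) {
    if (tid * ILP < n) load_store(dst, src, tid, tid);
  } else {
    for (int ii = 0; ii < ILP; ii++) {
      int i = tid * ILP + ii;
      if (i < n) dst[i] = src[i];
    }
  }
}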
@@ -76,8 +78,7 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
         for (int ii = 0; ii < ILP; ii++) {
           r_in[ii] = 0;
           int i = i_start + threadIdx.x + ii * blockDim.x;
-          if (i < n && i < chunk_size)
-            r_in[ii] = in[i];
+          if (i < n && i < chunk_size) r_in[ii] = in[i];
         }
         // note for clarification to future michael:
         // From a pure memory dependency perspective, there's likely no point
@@ -93,14 +94,13 @@ template <typename in_t, typename out_t> struct ScaleFunctor {
 #pragma unroll
         for (int ii = 0; ii < ILP; ii++) {
           int i = i_start + threadIdx.x + ii * blockDim.x;
-          if (i < n && i < chunk_size)
-            out[i] = r_out[ii];
+          if (i < n && i < chunk_size) out[i] = r_out[ii];
         }
       }
     }
     if (!finite)
       *noop_gmem =
           1; // Blindly fire off a write. These will race but that's ok.
   }
 };
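Both hunks above reformat the same scalar fallback pattern: each thread handles ILP elements per outer step, striding by blockDim.x so neighbouring threads touch neighbouring addresses, and the i < n && i < chunk_size guard covers the ragged tail of a chunk. A hedged, standalone sketch of that access pattern follows; the scale_fallback kernel name, the plain float pointers, and the single-array view of the chunk are simplifying assumptions (the real functor walks a TensorListMetadata chunk list).

#define ILP 4

// Illustrative kernel: per outer step each thread loads ILP values into
// registers, scales them, and writes them back, with a blockDim.x stride
// between the ILP accesses so the warp's loads/stores stay coalesced.
__global__ void scale_fallback(float *out, const float *in, float scale,
                               int n, int chunk_size) {
  for (int i_start = 0; i_start < n && i_start < chunk_size;
       i_start += blockDim.x * ILP) {
    float r_in[ILP];
    float r_out[ILP];
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      r_in[ii] = 0;
      int i = i_start + threadIdx.x + ii * blockDim.x;
      if (i < n && i < chunk_size) r_in[ii] = in[i]; // guarded tail load
    }
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) r_out[ii] = r_in[ii] * scale;
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      int i = i_start + threadIdx.x + ii * blockDim.x;
      if (i < n && i < chunk_size) out[i] = r_out[ii]; // guarded tail store
    }
  }
}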