
[Inference] Delete duplicated copy_vector (#5716)

Branch: pull/5723/head
Authored by 傅剑寒 6 months ago, committed by GitHub
Commit: 121d7ad629
Changed files (6):

  1. extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu (1 changed line)
  2. extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu (1 changed line)
  3. extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu (6 changed lines)
  4. extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu (22 changed lines)
  5. extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu (26 changed lines)
  6. extensions/csrc/kernel/cuda/utils/vec_copy.h (19 changed lines)
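All five kernel-side files below make the same two edits: the deprecated copy_vector is dropped from the using-declarations, and every call site switches to the surviving copy helper, which takes its arguments in the opposite order (source first, destination second, as the hunks show). A minimal sketch of the flip; the loop bodies here are illustrative stand-ins for demonstration, not the vectorized definitions in utils/vec_copy.h:

// Stand-ins for demonstration only (assumed bodies).
template <typename T, int VecSize>
__device__ __inline__ void copy_vector(T *dst, const T *src) {  // removed helper
#pragma unroll
  for (int i = 0; i < VecSize; ++i) dst[i] = src[i];
}

template <typename T, int VecSize>
__device__ __inline__ void copy(const T *src, T *dst) {  // surviving helper
#pragma unroll
  for (int i = 0; i < VecSize; ++i) dst[i] = src[i];
}

// So each call site changes from
//   copy_vector<scalar_t, VecSize>(dst_ptr, src_ptr);
// to
//   copy<scalar_t, VecSize>(src_ptr, dst_ptr);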

extensions/csrc/kernel/cuda/decode_kv_cache_memcpy_kernel.cu (1 changed line)

@@ -5,7 +5,6 @@
 #include "funcs/cast_functor.h"
 #include "common/micros.h"
 
-using colossalAI::cuda::utils::copy_vector;
 using colossalAI::cuda::utils::get_vec_size;
 using colossalAI::cuda::utils::copy;
 using colossalAI::funcs::CastFunctor;

extensions/csrc/kernel/cuda/fused_rotary_emb_and_cache_kernel.cu (1 changed line)

@@ -8,7 +8,6 @@
 #include "funcs/cast_functor.h"
 #include "funcs/binary_functor.h"
 
-using colossalAI::cuda::utils::copy_vector;
 using colossalAI::cuda::utils::get_vec_size;
 using colossalAI::cuda::utils::copy;
 using colossalAI::funcs::CastFunctor;

extensions/csrc/kernel/cuda/get_cos_and_sin_kernel.cu (6 changed lines)

@@ -4,7 +4,7 @@
 #include "utils/vec_copy.h"
 #include "common/micros.h"
 
-using colossalAI::cuda::utils::copy_vector;
+using colossalAI::cuda::utils::copy;
 using colossalAI::cuda::utils::get_vec_size;

@@ -23,8 +23,8 @@ __device__ void apply_cos_and_sin_memcopy(
   int begin_id = threadIdx.x * VecSize;
 
   for (; begin_id <= head_dim - VecSize; begin_id += blockDim.x){
-    copy_vector<scalar_t, VecSize>(cos + dest_offset_id + begin_id, cos_cache_ptr + src_offset_id + begin_id);
-    copy_vector<scalar_t, VecSize>(sin + dest_offset_id + begin_id, sin_cache_ptr + src_offset_id + begin_id);
+    copy<scalar_t, VecSize>(cos_cache_ptr + src_offset_id + begin_id, cos + dest_offset_id + begin_id);
+    copy<scalar_t, VecSize>(sin_cache_ptr + src_offset_id + begin_id, sin + dest_offset_id + begin_id);
   }
 
   if (!Aligned) {

extensions/csrc/kernel/cuda/scaled_masked_softmax_kernel.cu (22 changed lines)

@@ -23,7 +23,7 @@ using colossalAI::funcs::UnaryOpFunctor;
 using colossalAI::funcs::UnaryOpType;
 using colossalAI::funcs::warp_reduce;
 using colossalAI::funcs::ReduceType;
-using colossalAI::cuda::utils::copy_vector;
+using colossalAI::cuda::utils::copy;
 
 /*

@@ -87,8 +87,8 @@ __global__ void scaled_masked_softmax_warp_forward(
       if (element_index < batch_element_count) {
         int itr_idx = i * element_count + it * WARP_SIZE;
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(temp_data, src + itr_idx);
-        copy_vector<uint8_t, ELEMENTS_PER_LDG_STG>(temp_mask, mask + itr_idx);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(src + itr_idx, temp_data);
+        copy<uint8_t, ELEMENTS_PER_LDG_STG>(mask + itr_idx, temp_mask);
 
 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {

@@ -144,8 +144,8 @@ __global__ void scaled_masked_softmax_warp_forward(
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
           out[element] = elements[i][it + element] / sum[i];
         }
-        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-            dst + i * element_count + it * WARP_SIZE, out);
+        copy<output_t, ELEMENTS_PER_LDG_STG>(
+            out, dst + i * element_count + it * WARP_SIZE);
       } else {
         break;
       }

@@ -200,10 +200,10 @@ __global__ void scaled_masked_softmax_warp_backward(
     for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_grad, grad + i * element_count + it * WARP_SIZE);
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_output, output + i * element_count + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            grad + i * element_count + it * WARP_SIZE, temp_grad);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            output + i * element_count + it * WARP_SIZE, temp_output);
 
 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {

@@ -245,8 +245,8 @@ __global__ void scaled_masked_softmax_warp_backward(
               (output_t)(scale * (grad_reg[i][it + element] -
                                   output_reg[i][it + element] * sum[i]));
         }
-        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-            gradInput + i * element_count + it * WARP_SIZE, out);
+        copy<output_t, ELEMENTS_PER_LDG_STG>(
+            out, gradInput + i * element_count + it * WARP_SIZE);
       }
     }
   }
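For context, the softmax hunks above all follow one load-compute-store shape: copy stages ELEMENTS_PER_LDG_STG contiguous elements into registers, the per-element math runs on the staged values, and a second copy writes the results back. A self-contained sketch of that pattern; the kernel, the float types, and the loop-based copy body are illustrative assumptions, not the repository's code:

#include <cuda_runtime.h>

// Loop-based stand-in for the vectorized copy in utils/vec_copy.h.
template <typename T, int N>
__device__ __inline__ void copy(const T *src, T *dst) {
#pragma unroll
  for (int i = 0; i < N; ++i) dst[i] = src[i];
}

// Stage N elements into registers, scale them, store the block back.
template <int N>
__global__ void scale_elements(const float *src, float *dst, float scale,
                               int element_count) {
  int base = (blockIdx.x * blockDim.x + threadIdx.x) * N;
  if (base + N <= element_count) {
    float temp[N];
    copy<float, N>(src + base, temp);  // load: source first, like the new API
    float out[N];
#pragma unroll
    for (int e = 0; e < N; ++e) out[e] = scale * temp[e];
    copy<float, N>(out, dst + base);   // store: destination second
  }
}

A launch such as scale_elements<4><<<grid, block>>>(src, dst, scale, n) then moves four floats per copy call, which the real helper would turn into a single float4-wide access.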

extensions/csrc/kernel/cuda/scaled_upper_triang_masked_softmax_kernel.cu (26 changed lines)

@@ -23,8 +23,8 @@ using colossalAI::funcs::UnaryOpFunctor;
 using colossalAI::funcs::UnaryOpType;
 using colossalAI::funcs::warp_reduce;
 using colossalAI::funcs::ReduceType;
-using colossalAI::cuda::utils::copy_vector;
-using colossalAI::cuda::utils::copy_zero_vector;
+using colossalAI::cuda::utils::copy;
+using colossalAI::cuda::utils::copy_zero;
 
 /*
  * Extended softmax (from native aten pytorch) with following additional

@@ -75,8 +75,8 @@ __global__ void scaled_upper_triang_masked_softmax_warp_forward(
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_data, src + i * element_count * stride + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            src + i * element_count * stride + it * WARP_SIZE, temp_data);
 
 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {

@@ -140,10 +140,10 @@ __global__ void scaled_upper_triang_masked_softmax_warp_forward(
             out[element] = 0;
           }
         }
-        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-            dst + i * element_count * stride + it * WARP_SIZE, out);
+        copy<output_t, ELEMENTS_PER_LDG_STG>(
+            out, dst + i * element_count * stride + it * WARP_SIZE);
       } else if (element_index < element_count) {
-        copy_zero_vector<output_t, ELEMENTS_PER_LDG_STG>(
+        copy_zero<output_t, ELEMENTS_PER_LDG_STG>(
             dst + i * element_count * stride + it * WARP_SIZE);
       } else {
         break;

@@ -199,10 +199,10 @@ __global__ void scaled_upper_triang_masked_softmax_warp_backward(
     for (int it = 0; it < WARP_ITERATIONS; it += ELEMENTS_PER_LDG_STG) {
       int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE;
       if (element_index < batch_element_count) {
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_grad, grad + i * element_count * stride + it * WARP_SIZE);
-        copy_vector<input_t, ELEMENTS_PER_LDG_STG>(
-            temp_output, output + i * element_count * stride + it * WARP_SIZE);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            grad + i * element_count * stride + it * WARP_SIZE, temp_grad);
+        copy<input_t, ELEMENTS_PER_LDG_STG>(
+            output + i * element_count * stride + it * WARP_SIZE, temp_output);
 
 #pragma unroll
         for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {

@@ -248,8 +248,8 @@ __global__ void scaled_upper_triang_masked_softmax_warp_backward(
               (output_t)(scale * (grad_reg[i][it + element] -
                                   output_reg[i][it + element] * sum[i]));
         }
-        copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
-            gradInput + i * element_count * stride + it * WARP_SIZE, out);
+        copy<output_t, ELEMENTS_PER_LDG_STG>(
+            out, gradInput + i * element_count * stride + it * WARP_SIZE);
       }
     }
   }
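The forward hunk above is the only call site of the renamed copy_zero in this diff: in this causal (upper-triangular) kernel, a vector of outputs inside the valid prefix is stored with copy, while a vector past the causal boundary but still inside the row is zero-filled with a single vectorized store. A condensed sketch of that store step, with loop-based stand-ins for both helpers (the function and parameter names here are illustrative, not the repository's):

// Stand-ins; the real helpers issue one vectorized access per call.
template <typename T, int N>
__device__ __inline__ void copy(const T *src, T *dst) {
#pragma unroll
  for (int i = 0; i < N; ++i) dst[i] = src[i];
}

template <typename T, int N>
__device__ __inline__ void copy_zero(T *dst) {
#pragma unroll
  for (int i = 0; i < N; ++i) dst[i] = T(0);
}

// Store one N-wide vector of a causal row: computed values inside the
// valid prefix, zeros between the causal boundary and the row's end.
template <typename output_t, int N>
__device__ void store_causal_row(output_t *dst, const output_t *out,
                                 int element_index, int valid_count,
                                 int element_count) {
  if (element_index < valid_count) {
    copy<output_t, N>(out, dst);  // computed softmax values
  } else if (element_index < element_count) {
    copy_zero<output_t, N>(dst);  // masked-out tail: zero vector
  }
}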

extensions/csrc/kernel/cuda/utils/vec_copy.h (19 changed lines)

@@ -8,25 +8,8 @@ namespace colossalAI {
 namespace cuda {
 namespace utils {
 
-// Note(LiuYang): Depreciated
-template <typename T, int VecSize>
-__device__ __inline__ void copy_vector(T *dst, const T *src) {
-  using VT = typename common::VecTypeTrait<T, VecSize>::Type;
-  *(reinterpret_cast<VT *>(dst)) = *(reinterpret_cast<const VT *>(src));
-}
-
-template <>
-__device__ __inline__ void copy_vector<float, 8>(float *dst, const float *src) {
-  // Since the maximum memory alignment length is 128 bits, we choose float4
-  // here.
-  *(reinterpret_cast<float4 *>(dst)) = *(reinterpret_cast<const float4 *>(src));
-  *(reinterpret_cast<float4 *>(dst + 4)) =
-      *(reinterpret_cast<const float4 *>(src + 4));
-}
-
-// Note(LiuYang): Depreciated
 template <typename T, int VecSize>
-__device__ __inline__ void copy_zero_vector(T *dst) {
+__device__ __inline__ void copy_zero(T *dst) {
   using VT = typename common::VecTypeTrait<T, VecSize>::Type;
   *(reinterpret_cast<VT *>(dst)) = funcs::CastFunctor<float, VT>()(0.0f);
 }
