[NFC] polish pre-commit run --files colossalai/kernel/cuda_native/csrc/scaled_upper_triang_masked_softmax_cuda.cu code style (#943)

pull/997/head
HaoyuQin 2022-05-13 17:29:56 +08:00 committed by binmakeswell
parent 5bbefeb06a
commit c0f373db5d
1 changed file with 28 additions and 38 deletions

View File

@@ -2,12 +2,13 @@
* with minor changes. */ * with minor changes. */
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <ATen/cuda/CUDAContext.h> #include <cuda_runtime.h>
#include <torch/extension.h> #include <torch/extension.h>
#include "scaled_upper_triang_masked_softmax.h" #include "scaled_upper_triang_masked_softmax.h"
#include "type_shim.h" #include "type_shim.h"
@@ -15,18 +16,15 @@ namespace multihead_attn {
namespace fused_softmax { namespace fused_softmax {
namespace scaled_upper_triang_masked_softmax { namespace scaled_upper_triang_masked_softmax {
torch::Tensor fwd_cuda( torch::Tensor fwd_cuda(torch::Tensor const& input, float scale_factor) {
torch::Tensor const& input,
float scale_factor)
{
// input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
const int attn_batches = input.size(0); const int attn_batches = input.size(0);
const int seq_len = input.size(1); const int seq_len = input.size(1);
TORCH_INTERNAL_ASSERT(seq_len <= 2048); TORCH_INTERNAL_ASSERT(seq_len <= 2048);
// Output // Output
auto act_options = input.options().requires_grad(false); auto act_options = input.options().requires_grad(false);
torch::Tensor softmax_results = torch::Tensor softmax_results =
torch::empty({attn_batches, seq_len, seq_len}, act_options); torch::empty({attn_batches, seq_len, seq_len}, act_options);
// Softmax Intermediate Result Ptr // Softmax Intermediate Result Ptr
@@ -36,50 +34,42 @@ torch::Tensor fwd_cuda(
DISPATCH_HALF_AND_BFLOAT( DISPATCH_HALF_AND_BFLOAT(
input.scalar_type(), input.scalar_type(),
"dispatch_scaled_upper_triang_masked_softmax_forward", "dispatch_scaled_upper_triang_masked_softmax_forward",
dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>( dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t,
reinterpret_cast<scalar_t*>(softmax_results_ptr), float>(
reinterpret_cast<const scalar_t*>(input_ptr), reinterpret_cast<scalar_t*>(softmax_results_ptr),
scale_factor, reinterpret_cast<const scalar_t*>(input_ptr), scale_factor, seq_len,
seq_len, seq_len, attn_batches););
seq_len,
attn_batches);
);
return softmax_results; return softmax_results;
} }
torch::Tensor bwd_cuda( torch::Tensor bwd_cuda(torch::Tensor const& output_grads_,
torch::Tensor const& output_grads_, torch::Tensor const& softmax_results_,
torch::Tensor const& softmax_results_, float scale_factor) {
float scale_factor) {
auto output_grads = output_grads_.contiguous(); auto output_grads = output_grads_.contiguous();
auto softmax_results = softmax_results_.contiguous(); auto softmax_results = softmax_results_.contiguous();
//output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] // output grads is a 3d tensor with dimensions [attn_batches, seq_len,
// seq_len]
const int attn_batches = output_grads.size(0); const int attn_batches = output_grads.size(0);
const int seq_len = output_grads.size(1); const int seq_len = output_grads.size(1);
TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr()); void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
//Softmax Grad // Softmax Grad
DISPATCH_HALF_AND_BFLOAT( DISPATCH_HALF_AND_BFLOAT(
output_grads_.scalar_type(), output_grads_.scalar_type(),
"dispatch_scaled_upper_triang_masked_softmax_backward", "dispatch_scaled_upper_triang_masked_softmax_backward",
dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>( dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t,
reinterpret_cast<scalar_t*>(output_grads_ptr), float>(
reinterpret_cast<scalar_t*>(output_grads_ptr), reinterpret_cast<scalar_t*>(output_grads_ptr),
reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()), reinterpret_cast<scalar_t*>(output_grads_ptr),
scale_factor, reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
seq_len, scale_factor, seq_len, seq_len, attn_batches););
seq_len,
attn_batches); // backward pass is completely in-place
);
//backward pass is completely in-place
return output_grads; return output_grads;
} }
} } // namespace scaled_upper_triang_masked_softmax
} } // namespace fused_softmax
} } // namespace multihead_attn