mirror of https://github.com/hpcaitech/ColossalAI
79 lines
2.5 KiB
C++
79 lines
2.5 KiB
C++
#pragma once
|
|
|
|
#include <cuda.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdint>
#include <stdexcept>

#include "nvgpu_dev_info.h"
|
|
|
|
namespace colossalAI {
|
|
namespace cuda {
|
|
namespace utils {
|
|
|
|
struct GPULaunchConfig {
|
|
dim3 block{1, 1, 1};
|
|
dim3 grid{1, 1, 1};
|
|
};
|
|
|
|
// Computes a 1-D launch configuration (block.x and grid.x only; the other
// dimensions keep their GPULaunchConfig defaults) for `numel` elements when
// every thread processes `vec_size` elements.
//
// Parameters:
//   dev_info  Source of the device limits read here: max threads per block,
//             max grid dimension along x, and multiprocessor count.
//   numel     Total number of elements to process; must be >= 0.
//   vec_size  Number of elements handled by one thread; must be > 0.
//
// Returns:
//   A GPULaunchConfig whose block.x is at least 64 unless the device limit
//   reported by dev_info is smaller. It is a power of two unless that limit
//   forces a clamp. grid.x is ceil(active_threads / block.x).
//   When numel == 0 the result has grid.x == 0, so callers are expected to
//   skip the launch in that case.
//
// Throws:
//   std::invalid_argument if vec_size <= 0, if numel < 0, or if the work
//   cannot be covered even with the maximum grid and block sizes.
static GPULaunchConfig GetGPULaunchConfig1D(const NVGPUDevInfo& dev_info,
                                            int64_t numel, int64_t vec_size) {
  if (vec_size <= 0)
    throw std::invalid_argument("vec_size must be a positive integer!");
  if (numel < 0)
    throw std::invalid_argument("numel must be a non-negative integer!");

  const int64_t max_threads_per_block = dev_info.GetMaxThreadsPerBlock();
  const int64_t max_blocks_per_grid = dev_info.GetMaxGridDims()[0];
  const int64_t kMinimumSize = 64;
  const int64_t kMaximumSize = 512;
  // Ceil-division written with quotient/remainder so that it cannot overflow
  // the way `numel + vec_size - 1` can for very large inputs.
  const int64_t active_threads =
      numel / vec_size + ((numel % vec_size) != 0 ? 1 : 0);
  // Floor of 1 keeps the divisions below well defined even if the device
  // info reports no multiprocessors.
  const int64_t sm_num =
      std::max<int64_t>(dev_info.GetMultiProcessorCount(), 1);

  // Note(LiuYang): expected threads should be in [64, 128, 256, 512] generally
  int64_t expected_threads_per_block = kMaximumSize;

  // Smallest power of two that is >= x; returns 1 for every x <= 1.
  // Callers only pass values bounded by max_threads_per_block, so the shift
  // cannot overflow.
  auto RoundUpToPowerOfTwo = [](int64_t x) -> int64_t {
    int64_t ret = 1;
    while (ret < x) ret <<= 1;
    return ret;
  };

  // Block size that would spread the work over about 2 * sm_num blocks, or
  // about 4 * sm_num blocks when the former does not fit into one block.
  const int64_t threads_for_2x_sm = active_threads / (sm_num << 1);
  const int64_t threads_for_4x_sm = active_threads / (sm_num << 2);
  if (threads_for_2x_sm < max_threads_per_block) {
    expected_threads_per_block = RoundUpToPowerOfTwo(threads_for_2x_sm);
  } else if (threads_for_4x_sm < max_threads_per_block) {
    expected_threads_per_block = RoundUpToPowerOfTwo(threads_for_4x_sm);
  }

  expected_threads_per_block =
      std::max(expected_threads_per_block, kMinimumSize);
  // Rounding up (or the 64-thread floor) must never exceed the device limit.
  expected_threads_per_block =
      std::min(expected_threads_per_block, max_threads_per_block);

  int64_t expect_block_per_grid =
      ((active_threads + expected_threads_per_block - 1) /
       expected_threads_per_block);

  if (expect_block_per_grid > max_blocks_per_grid) {
    // Too many blocks: pin the grid to its maximum and grow the block instead.
    expect_block_per_grid = max_blocks_per_grid;
    expected_threads_per_block =
        (active_threads + expect_block_per_grid - 1) / expect_block_per_grid;
    if (expected_threads_per_block > max_threads_per_block)
      throw std::invalid_argument(
          "Threads required for current input exceed for current GPU!");
    // The clamp keeps the rounded size within the device limit; it is still
    // >= the pre-rounding value, so the recomputed grid cannot exceed
    // max_blocks_per_grid.
    expected_threads_per_block =
        std::min(RoundUpToPowerOfTwo(expected_threads_per_block),
                 max_threads_per_block);
    expect_block_per_grid = ((active_threads + expected_threads_per_block - 1) /
                             expected_threads_per_block);
  }

  GPULaunchConfig config;
  config.block.x = static_cast<unsigned int>(expected_threads_per_block);
  config.grid.x = static_cast<unsigned int>(expect_block_per_grid);
  return config;
}
|
|
|
|
} // namespace utils
|
|
} // namespace cuda
|
|
} // namespace colossalAI
|