add reusable utils for cuda

pull/5435/head
xs_courtesy 2024-03-08 14:53:29 +08:00
parent 593a72e4d5
commit a46598ac59
6 changed files with 284 additions and 0 deletions


@@ -0,0 +1,20 @@
#pragma once
#include <memory>
#include "common/nvgpu_dev_info.h"
#include "target.h"
namespace colossalAI {
namespace common {
template <typename Ret>
class DevInfoMgr final {
public:
static std::unique_ptr<Ret> GetDevInfo(int device_num) {
return std::make_unique<Ret>(device_num);
}
};
} // namespace common
} // namespace colossalAI
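
As a usage sketch (not part of this commit; include paths and the device index are assumptions), the factory can stamp out accessors for the NVGPUDevInfo type introduced later in this commit:

// Hypothetical usage of DevInfoMgr; paths are assumed.
#include "common/dev_info_mgr.h"
#include "cuda/utils/nvgpu_dev_info.h"

int main() {
  using colossalAI::common::DevInfoMgr;
  using colossalAI::cuda::utils::NVGPUDevInfo;
  // Build a managed NVGPUDevInfo for CUDA device 0.
  auto info = DevInfoMgr<NVGPUDevInfo>::GetDevInfo(0);
  return info->GetMultiProcessorCount() > 0 ? 0 : 1;
}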


@@ -0,0 +1,134 @@
#pragma once
#include <ostream>
#include <stdexcept>
#include <string>
namespace colossalAI {
namespace common {
class Target {
public:
enum class OS : int {
Unk = -1,
Linux,
Windows,
};
enum class Arch : int {
Unk = -1,
X86,
Arm,
NVGPU,
AMDGPU,
Ascend,
};
enum class BitLen : int {
Unk = -1,
k32,
k64,
};
explicit Target(OS os, Arch arch, BitLen bitlen)
: os_(os), arch_(arch), bitlen_(bitlen) {}
bool defined() const {
return (os_ != OS::Unk) && (arch_ != Arch::Unk) && (bitlen_ != BitLen::Unk);
}
std::string str() const {
std::string s{"OS: "};
switch (os_) {
case OS::Unk:
s += "Unk";
break;
case OS::Linux:
s += "Linux";
break;
case OS::Windows:
s += "Windows";
break;
default:
throw std::invalid_argument("Invalid OS type!");
}
s += "\t";
s += "Arch: ";
switch (arch_) {
case Arch::Unk:
s += "Unk";
break;
case Arch::X86:
s += "X86";
break;
case Arch::Arm:
s += "Arm";
break;
case Arch::NVGPU:
s += "NVGPU";
break;
case Arch::AMDGPU:
s += "AMDGPU";
break;
case Arch::Ascend:
s += "Ascend";
break;
default:
throw std::invalid_argument("Invalid Arch type!");
}
s += "\t";
s += "BitLen: ";
switch (bitlen_) {
case BitLen::Unk:
s += "Unk";
break;
case BitLen::k32:
s += "k32";
break;
case BitLen::k64:
s += "k64";
break;
default:
throw std::invalid_argument("Invalid target bit length!");
}
return s;
}
OS os() const { return os_; }
Arch arch() const { return arch_; }
BitLen bitlen() const { return bitlen_; }
static Target DefaultX86Target();
static Target DefaultArmTarget();
static Target DefaultRocmTarget();
static Target DefaultAscendTarget();
static Target DefaultCUDATarget() {
return Target(OS::Linux, Arch::NVGPU, BitLen::k64);
}
friend std::ostream& operator<<(std::ostream& os, const Target& target);
friend bool operator==(const Target& lhs, const Target& rhs);
friend bool operator!=(const Target& lhs, const Target& rhs);
private:
OS os_{OS::Unk};
Arch arch_{Arch::Unk};
BitLen bitlen_{BitLen::Unk};
};
inline std::ostream& operator<<(std::ostream& os, const Target& target) {
  os << target.str();
  return os;
}
inline bool operator==(const Target& lhs, const Target& rhs) {
return (lhs.os_ == rhs.os_) && (lhs.arch_ == rhs.arch_) &&
(lhs.bitlen_ == rhs.bitlen_);
}
inline bool operator!=(const Target& lhs, const Target& rhs) {
  return !(lhs == rhs);
}
} // namespace common
} // namespace colossalAI
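
A brief usage sketch of the Target class as defined above (the include path is assumed):

// Construct the default CUDA target and inspect it.
#include <iostream>
#include "common/target.h"  // assumed include path

int main() {
  using colossalAI::common::Target;
  Target t = Target::DefaultCUDATarget();
  std::cout << t << '\n';  // prints "OS: Linux	Arch: NVGPU	BitLen: k64"
  return (t.defined() && t == Target::DefaultCUDATarget()) ? 0 : 1;
}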


@@ -0,0 +1,36 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>

#include <cstdint>
namespace colossalAI {
namespace cuda {
namespace utils {
class GPULaunchConfig {
 public:
  GPULaunchConfig() = default;
  GPULaunchConfig(const dim3& block, const dim3& grid)
      : block_(block), grid_(grid) {}
  friend GPULaunchConfig GPUGetGPULaunchConfig1D(int64_t numel, int vec_size);

 protected:
  void set_block(const dim3& dim) { block_ = dim; }
  void set_grid(const dim3& dim) { grid_ = dim; }

 private:
  dim3 block_{1, 1, 1};
  dim3 grid_{1, 1, 1};
};

GPULaunchConfig GPUGetGPULaunchConfig1D(int64_t numel, int vec_size);

// TODO(LiuYang): to be implemented
GPULaunchConfig GPUGetGPULaunchConfig2D(int64_t numel, int vec_size);

// TODO(LiuYang): to be implemented
GPULaunchConfig GPUGetGPULaunchConfig3D(int64_t numel, int vec_size);
} // namespace utils
} // namespace cuda
} // namespace colossalAI
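
GPUGetGPULaunchConfig1D is only declared here. One plausible body, offered purely as a hedged sketch (the block size and rounding scheme are assumptions, not the commit's code):

// Hypothetical implementation of GPUGetGPULaunchConfig1D.
GPULaunchConfig GPUGetGPULaunchConfig1D(int64_t numel, int vec_size) {
  constexpr unsigned kThreads = 256;                  // assumed block size
  int64_t elems = (numel + vec_size - 1) / vec_size;  // elements after vectorized loads
  unsigned blocks = static_cast<unsigned>((elems + kThreads - 1) / kThreads);
  return GPULaunchConfig(dim3(kThreads, 1, 1), dim3(blocks, 1, 1));
}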


@@ -0,0 +1,12 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
// NOTE: LOG(FATAL) is assumed to be provided by the project's logging
// framework (e.g. glog); it must be available at the call site.
#define CUDA_CHECK(func)                                            \
  do {                                                              \
    cudaError_t status = (func);                                    \
    if (status != cudaSuccess) {                                    \
      LOG(FATAL) << "CUDA Error : " << cudaGetErrorString(status);  \
    }                                                               \
  } while (0)
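
Call-site usage is straightforward; this sketch assumes a LOG(FATAL)-style logging backend is linked:

// Wrap any CUDA runtime call that returns cudaError_t.
void* buf = nullptr;
CUDA_CHECK(cudaMalloc(&buf, 1024));  // terminates with the error string on failure
CUDA_CHECK(cudaFree(buf));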


@@ -0,0 +1,45 @@
#include "nvgpu_dev_info.h"
#include <array>
namespace colossalAI {
namespace cuda {
namespace utils {
std::array<int, 3> NVGPUDevInfo::GetMaxGridDims() const {
  std::array<int, 3> ret;
  ret[0] = prop_.maxGridSize[0];
  ret[1] = prop_.maxGridSize[1];
  ret[2] = prop_.maxGridSize[2];
  return ret;
}

std::array<int, 3> NVGPUDevInfo::GetMaxBlockDims() const {
  std::array<int, 3> ret;
  ret[0] = prop_.maxThreadsDim[0];
  ret[1] = prop_.maxThreadsDim[1];
  ret[2] = prop_.maxThreadsDim[2];
  return ret;
}

std::array<int, 2> NVGPUDevInfo::GetCapability() const {
  std::array<int, 2> ret;
  ret[0] = prop_.major;
  ret[1] = prop_.minor;
  return ret;
}

int NVGPUDevInfo::GetMultiProcessorCount() const {
  return prop_.multiProcessorCount;
}

int NVGPUDevInfo::GetMaxThreadsPerMultiProcessor() const {
  return prop_.maxThreadsPerMultiProcessor;
}

int NVGPUDevInfo::GetMaxThreadsPerBlock() const {
  return prop_.maxThreadsPerBlock;
}
} // namespace utils
} // namespace cuda
} // namespace colossalAI


@@ -0,0 +1,37 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <array>

#include "micros.h"
namespace colossalAI {
namespace cuda {
namespace utils {
class NVGPUDevInfo {
public:
explicit NVGPUDevInfo(int device_num) : device_num_(device_num) {
  CUDA_CHECK(cudaGetDeviceProperties(&prop_, device_num));
}
std::array<int, 3> GetMaxGridDims() const;
std::array<int, 3> GetMaxBlockDims() const;
std::array<int, 2> GetCapability() const;
int GetMultiProcessorCount() const;
int GetMaxThreadsPerMultiProcessor() const;
int GetMaxThreadsPerBlock() const;
private:
int device_num_;
cudaDeviceProp prop_;
};
} // namespace utils
} // namespace cuda
} // namespace colossalAI
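
A minimal sketch exercising the wrapper (include path assumed):

#include <cstdio>
#include "cuda/utils/nvgpu_dev_info.h"  // assumed include path

int main() {
  colossalAI::cuda::utils::NVGPUDevInfo info(0);  // query CUDA device 0
  auto cap = info.GetCapability();
  std::printf("compute capability %d.%d, %d SMs\n", cap[0], cap[1],
              info.GetMultiProcessorCount());
  return 0;
}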