mirror of https://github.com/hpcaitech/ColossalAI
[builder] raise Error when CUDA_HOME is not set (#2213)
parent 78a89d9b41
commit 7675792100

This commit adds a get_cuda_include() helper to the shared Builder base class that raises a clear error when CUDA_HOME is unset, routes the CPUAdam, FusedOptim, and MultiHeadAttn builders through it, and updates the GPT demo's benchmark table, launch script, and optimizer arguments.
@@ -30,6 +30,13 @@ class Builder(object):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
 
+    def get_cuda_include(self):
+        from torch.utils.cpp_extension import CUDA_HOME
+        if CUDA_HOME is None:
+            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
+        cuda_include = os.path.join(CUDA_HOME, "include")
+        return cuda_include
+
     def strip_empty_entries(self, args):
         '''
         Drop any empty strings from the list of compile and link flags
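The new helper centralizes CUDA header discovery so a missing toolkit fails fast with an actionable message instead of a later TypeError from os.path.join(None, "include"). A minimal standalone sketch of the same check, outside the Builder hierarchy (the function name cuda_include_dir is illustrative, not part of the patch):

    import os

    def cuda_include_dir():
        # torch resolves CUDA_HOME from the CUDA_HOME/CUDA_PATH environment
        # variables or a default install location such as /usr/local/cuda;
        # it is None when no toolkit can be found.
        from torch.utils.cpp_extension import CUDA_HOME
        if CUDA_HOME is None:
            # Fail fast with a clear instruction rather than crashing later
            # inside os.path.join(None, "include").
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME "
                               "to compile C++/CUDA kernels in ColossalAI.")
        return os.path.join(CUDA_HOME, "include")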
@@ -27,9 +27,7 @@ class CPUAdamBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def strip_empty_entries(self, args):
         '''
@@ -31,10 +31,7 @@ class FusedOptimBuilder(Builder):
         ]
 
     def include_paths(self):
-        import torch
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def builder(self, name):
         from torch.utils.cpp_extension import CUDAExtension
@@ -31,10 +31,8 @@ class MultiHeadAttnBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
         ret = []
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        ret = [os.path.join(self.base_dir, "includes"), cuda_include]
+        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_include()]
         ret.append(os.path.join(self.base_dir, "kernels", "include"))
         print("include_paths", ret)
         return ret
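All three builders previously re-imported CUDA_HOME and joined the path themselves, which produced an unhelpful TypeError rather than an explanation when CUDA_HOME was None; after this refactor the check lives in one place in the base class. A sketch of the resulting pattern for any further builder (the class name and directory are hypothetical):

    import os

    class MyKernelBuilder(Builder):
        # Hypothetical subclass for illustration only.
        BASE_DIR = "my_kernel"

        def include_paths(self):
            # get_cuda_include() now performs the CUDA_HOME check and raises
            # a RuntimeError with setup instructions when it is unset.
            return [os.path.join(MyKernelBuilder.BASE_DIR, "includes"),
                    self.get_cuda_include()]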
@@ -106,3 +106,8 @@ Touch the bar of model scale and batch size.
 | gpt2_20b | 8 | auto  | 2 | 16 | 99.871  |
 | gpt2_20b | 8 | cpu   | 2 | 64 | 125.170 |
 | gpt2_20b | 8 | const | 2 | 32 | 105.415 |
+
+
+| model    | #GPU | policy | TP | batch per DP | Tflops |
+| -------- | ---- | ------ | -- | ------------ | ------ |
+| gpt2_20b | 8    | cpu    | 2  | 8            | 46.895 |
@@ -2,12 +2,12 @@
 export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=8
 export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
-export BATCH_SIZE=64
-export MODEL_TYPE="gpt2_20b"
+export BATCH_SIZE=32
+# export MODEL_TYPE="gpt2_24b"
 
 mkdir -p logs
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
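The script now doubles the tensor-parallel degree (TPDEGREE 2 -> 4) and halves the per-rank batch size (BATCH_SIZE 64 -> 32); note that the MODEL_TYPE export is replaced by a commented-out gpt2_24b line, so the model type presumably comes from the caller's environment. For orientation, a hedged sketch of the flag surface the torchrun command line implies; the actual argparse setup in train_gpt_demo.py may differ in defaults and types:

    import argparse

    # Flags inferred from the command line above; defaults are placeholders,
    # not the demo's real values.
    parser = argparse.ArgumentParser()
    parser.add_argument("--tp_degree", type=int, default=1)            # $TPDEGREE
    parser.add_argument("--model_type", type=str, default="gpt2_20b")  # $MODEL_TYPE
    parser.add_argument("--batch_size", type=int, default=32)          # $BATCH_SIZE
    parser.add_argument("--placement", type=str, default="cpu")        # $PLACEMENT
    parser.add_argument("--shardinit", type=str, default="False")      # $USE_SHARD_INIT
    parser.add_argument("--distplan", type=str, default="colossalai")  # $DISTPAN
    args = parser.parse_args()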
@@ -218,7 +218,7 @@ def main():
         model = gemini_zero_dpp(model, pg, args.placement)
 
         # build highly optimized cpu optimizer
-        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5, gpu_margin_mem_ratio=0.6)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
         model = model_builder(args.model_type)(checkpoint=True).cuda()
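The only functional change here is the added gpu_margin_mem_ratio argument. In Gemini's ZeRO optimizer this ratio sets how much of the GPU memory left over after the peak non-model data may hold optimizer states, letting parameter updates run on the GPU instead of the CPU; per ColossalAI's documentation it only takes effect when the placement policy is "auto". A commented sketch of the call, with names taken from the demo itself:

    # GeminiAdamOptimizer and model are the demo's own names.
    optimizer = GeminiAdamOptimizer(
        model,
        lr=1e-3,                   # learning rate
        initial_scale=2**5,        # initial loss scale for mixed precision
        gpu_margin_mem_ratio=0.6,  # allow up to 60% of the spare GPU memory to
                                   # hold optimizer states; 0.0 keeps updates on CPU
    )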