mirror of https://github.com/hpcaitech/ColossalAI
[builder] raise Error when CUDA_HOME is not set (#2213)
parent 78a89d9b41
commit 7675792100
@@ -30,6 +30,13 @@ class Builder(object):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
 
+    def get_cuda_include(self):
+        from torch.utils.cpp_extension import CUDA_HOME
+        if CUDA_HOME is None:
+            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
+        cuda_include = os.path.join(CUDA_HOME, "include")
+        return cuda_include
+
     def strip_empty_entries(self, args):
         '''
         Drop any empty strings from the list of compile and link flags
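
Note: get_cuda_include() becomes the single place where the CUDA toolkit location is resolved. A minimal sketch of the guard, assuming (as torch.utils.cpp_extension does) that CUDA_HOME is taken from the CUDA_HOME environment variable or an auto-detected toolkit and is None otherwise:

import os
from torch.utils.cpp_extension import CUDA_HOME  # None when no CUDA toolkit can be found

if CUDA_HOME is None:
    # Same failure the builder now raises up front; the fix is to export CUDA_HOME
    # (e.g. CUDA_HOME=/usr/local/cuda, an example path) before building the kernels.
    raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")

print(os.path.join(CUDA_HOME, "include"))  # the include directory the builders pass to the compiler
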
@@ -27,9 +27,7 @@ class CPUAdamBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def strip_empty_entries(self, args):
         '''
@@ -31,10 +31,7 @@ class FusedOptimBuilder(Builder):
         ]
 
     def include_paths(self):
-        import torch
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def builder(self, name):
         from torch.utils.cpp_extension import CUDAExtension
@@ -31,10 +31,8 @@ class MultiHeadAttnBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
         ret = []
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        ret = [os.path.join(self.base_dir, "includes"), cuda_include]
+        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_include()]
         ret.append(os.path.join(self.base_dir, "kernels", "include"))
         print("include_paths", ret)
         return ret
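
All three kernel builders (CPUAdamBuilder, FusedOptimBuilder, MultiHeadAttnBuilder) previously duplicated the CUDA_HOME lookup inside include_paths(); after this commit they delegate to the base-class helper. A self-contained sketch of the resulting pattern (the subclass and its BASE_DIR are hypothetical; only the get_cuda_include() body comes from this commit):

import os

class Builder:
    # Stand-in with just the helper added by this commit, so the sketch runs on its own;
    # the real base class lives in ColossalAI's op_builder module.
    def get_cuda_include(self):
        from torch.utils.cpp_extension import CUDA_HOME
        if CUDA_HOME is None:
            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
        return os.path.join(CUDA_HOME, "include")

class MyKernelBuilder(Builder):
    # Hypothetical builder, for illustration only.
    BASE_DIR = "my_kernel"

    def include_paths(self):
        # Delegating to get_cuda_include() makes every builder fail with the same clear
        # RuntimeError when CUDA_HOME is unset, instead of a TypeError raised by
        # os.path.join(None, "include").
        return [os.path.join(self.BASE_DIR, "includes"), self.get_cuda_include()]
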
@@ -106,3 +106,8 @@ Touch the bar of model scale and batch size.
 | gpt2_20b | 8 | auto | 2 | 16 | 99.871 |
 | gpt2_20b | 8 | cpu | 2 | 64 | 125.170 |
 | gpt2_20b | 8 | const | 2 | 32 | 105.415 |
+
+
+| model | #GPU | policy | TP | batch per DP | Tflops |
+| ---------- | --------- |--------- |--------- |--------- |--------- |
+| gpt2_20b | 8 | cpu | 2 | 8 | 46.895 |
@@ -2,12 +2,12 @@
 export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=8
 export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
-export BATCH_SIZE=64
-export MODEL_TYPE="gpt2_20b"
+export BATCH_SIZE=32
+# export MODEL_TYPE="gpt2_24b"
 
 mkdir -p logs
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
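
The script's environment variables map one-to-one onto the demo's CLI flags. For orientation, a hedged sketch of how train_gpt_demo.py might parse them; the flag names are taken from the torchrun command above, while types, defaults, and help text are assumptions, not the demo's actual parser:

import argparse

# Illustrative reconstruction of the CLI surface used by the run script.
parser = argparse.ArgumentParser(description="GPT Gemini benchmark (sketch)")
parser.add_argument("--distplan", type=str, default="colossalai")  # DISTPAN
parser.add_argument("--tp_degree", type=int, default=1)            # TPDEGREE
parser.add_argument("--placement", type=str, default="cpu")        # PLACEMENT
parser.add_argument("--shardinit", type=str, default="False")      # USE_SHARD_INIT
parser.add_argument("--batch_size", type=int, default=8)           # BATCH_SIZE
parser.add_argument("--model_type", type=str, default="gpt2_20b")  # MODEL_TYPE
args = parser.parse_args()
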
@@ -218,7 +218,7 @@ def main():
         model = gemini_zero_dpp(model, pg, args.placement)
 
         # build highly optimized cpu optimizer
-        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5, gpu_margin_mem_ratio=0.6)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
         model = model_builder(args.model_type)(checkpoint=True).cuda()
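
The only functional change in this hunk is the extra gpu_margin_mem_ratio argument. A hedged reading of the updated call, reusing the model and GeminiAdamOptimizer already in scope in train_gpt_demo.py and assuming the ratio controls how much of the GPU memory left spare after forward/backward Gemini may use for optimizer states:

# Same call as in the hunk above, spread out with assumed-meaning comments.
optimizer = GeminiAdamOptimizer(
    model,
    lr=1e-3,
    initial_scale=2**5,        # initial loss scale for mixed-precision training
    gpu_margin_mem_ratio=0.6,  # assumption: fraction of the spare GPU memory margin
                               # allowed to hold optimizer states / run updates on GPU
)
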