mirror of https://github.com/hpcaitech/ColossalAI
[example] add benchmark.sh for gpt (#2226)
parent 3629e611cd
commit 49c601da21
@@ -0,0 +1,22 @@
for MODEL_NAME in "gpt2_medium"  # model size must be one known to model_zoo.model_builder
do
for BATCH_SIZE in 8
do
for GPUNUM in 1 2 4 8
do
for TPDEGREE in 1 2 4 8
do
if [ ${TPDEGREE} -gt ${GPUNUM} ]
then
continue  # skip combinations where the tensor-parallel degree exceeds the GPU count
fi
echo "****************** Begin ***************************"
echo "* benchmarking MODEL_NAME ${MODEL_NAME} BS ${BATCH_SIZE} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
MODEL_TYPE=${MODEL_NAME} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} bash ./run.sh  # run.sh reads these settings from the environment
echo "****************** Finished ***************************"
echo ""
echo ""
done
done
done
done
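Note: benchmark.sh relies on run.sh reading MODEL_TYPE, BATCH_SIZE, GPUNUM and TPDEGREE from the environment (see the ${VAR:-default} expansions in the run.sh hunk further down). A minimal Python sketch of the same sweep, purely illustrative and not part of this commit:

# Illustrative sketch only (not part of the commit): the same sweep driven from Python,
# assuming run.sh reads MODEL_TYPE, BATCH_SIZE, GPUNUM and TPDEGREE from the environment.
import itertools
import os
import subprocess

for model, bs, gpunum, tp in itertools.product(["gpt2_medium"], [8], [1, 2, 4, 8], [1, 2, 4, 8]):
    if tp > gpunum:
        continue  # tensor-parallel degree cannot exceed the number of GPUs
    env = dict(os.environ, MODEL_TYPE=model, BATCH_SIZE=str(bs), GPUNUM=str(gpunum), TPDEGREE=str(tp))
    subprocess.run(["bash", "./run.sh"], env=env, check=True)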
@@ -53,7 +53,7 @@ def gpt2_24b(checkpoint=True):
    return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)


def model_builder(model_size: str):
def model_builder(model_size: str) -> callable:
    if model_size == "gpt2_medium":
        return gpt2_medium
    elif model_size == "gpt2_xl":
@@ -66,6 +66,8 @@ def model_builder(model_size: str):
        return gpt2_20b
    elif model_size == "gpt2_24b":
        return gpt2_24b
    else:
        raise TypeError(f"model_builder {model_size}")


__all__ = ['model_builder']
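The builder returns a constructor rather than a model instance. A hypothetical usage sketch (the exact call site is not shown in this diff; the checkpoint argument mirrors the gpt2_24b signature above):

# Hypothetical usage of model_builder; illustrative only.
from model_zoo import model_builder

build_fn = model_builder("gpt2_medium")   # look up the constructor for the requested size
model = build_fn(checkpoint=True)         # instantiate it, assuming the same signature as gpt2_24b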
@@ -1,13 +1,13 @@
# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
export DISTPAN="colossalai"
export DISTPAN=${DISTPAN:-"colossalai"}

# The following options are only valid when DISTPAN="colossalai"
export TPDEGREE=1
export GPUNUM=1
export PLACEMENT='const'
export USE_SHARD_INIT=False
export BATCH_SIZE=32
# export MODEL_TYPE="gpt2_10b"
export TPDEGREE=${TPDEGREE:-1}
export GPUNUM=${GPUNUM:-1}
export PLACEMENT=${PLACEMENT:-'const'}
export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
export BATCH_SIZE=${BATCH_SIZE:-8}
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}

mkdir -p logs
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
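For reference, the tee target above encodes the run configuration in the log file name. A small illustrative Python sketch of the same pattern (not part of the commit):

# Illustrative reconstruction of run.sh's log-file naming scheme.
def log_path(model_type: str, distpan: str, gpunum: int, batch_size: int, tpdegree: int) -> str:
    return f"./logs/{model_type}_{distpan}_gpu_{gpunum}_bs_{batch_size}_tp_{tpdegree}.log"

print(log_path("gpt2_medium", "colossalai", 1, 8, 1))  # ./logs/gpt2_medium_colossalai_gpu_1_bs_8_tp_1.log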
@@ -5,7 +5,6 @@ from time import time
import psutil
import torch
import torch.nn as nn
from model_zoo import model_builder
from packaging import version
from torch.nn.parallel import DistributedDataParallel as DDP

@@ -17,6 +16,7 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, Proces
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.zero.sharded_optim import LowLevelZeroOptimizer
from model_zoo import model_builder


def parse_args():
@@ -55,7 +55,7 @@ def parse_args():
    parser.add_argument(
        "--model_type",
        type=str,
        default='gpt2_medium',
        default="gpt2_medium",
        help="model scale",
    )
    args = parser.parse_args()
@@ -309,6 +309,8 @@ def main():
        if n >= WARMUP_STEPS:
            tflops_list.append(step_tflops)

    logger.info(f"max memory {torch.cuda.memory_allocated() / 1024**2} MB", ranks=[0])

    tflops_list.sort()
    median_index = (NUM_STEPS - WARMUP_STEPS) >> 1  # tflops_list holds only the post-warmup samples
    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")