[example] add benchmark.sh for gpt (#2226)

Jiarui Fang 2022-12-29 12:00:00 +08:00 committed by GitHub
parent 3629e611cd
commit 49c601da21
4 changed files with 36 additions and 10 deletions

benchmark.sh

@@ -0,0 +1,22 @@
for MODEL_TYPE in "gpt2_medium"
do
for BATCH_SIZE in 8
do
for GPUNUM in 1 2 4 8
do
for TPDEGREE in 1 2 4 8
do
if [ ${TPDEGREE} -gt ${GPUNUM} ]
then
continue
fi
echo "****************** Begin ***************************"
echo "* benchmrking MODEL_NAME ${MODEL_NAME} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
bash ./run.sh
echo "****************** Finished ***************************"
echo ""
echo ""
done
done
done
done
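For reference, a single configuration from the sweep above can be reproduced by hand by passing the same environment variables straight to run.sh (a minimal sketch; the values below are illustrative, and the variable names assume the defaults defined in run.sh further down):

# one-off run equivalent to a single sweep point (illustrative values)
MODEL_TYPE="gpt2_medium" BATCH_SIZE=8 GPUNUM=4 TPDEGREE=2 bash ./run.sh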

model_zoo.py

@@ -53,7 +53,7 @@ def gpt2_24b(checkpoint=True):
return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)
def model_builder(model_size: str):
def model_builder(model_size: str) -> callable:
if model_size == "gpt2_medium":
return gpt2_medium
elif model_size == "gpt2_xl":
@@ -66,6 +66,8 @@ def model_builder(model_size: str):
return gpt2_20b
elif model_size == "gpt2_24b":
return gpt2_24b
else:
raise TypeError(f"model_builder: unsupported model size {model_size}")
__all__ = ['model_builder']

run.sh

@@ -1,13 +1,13 @@
# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
export DISTPAN="colossalai"
export DISTPAN=${DISTPAN:-"colossalai"}
# The following options are only valid when DISTPAN="colossalai"
export TPDEGREE=1
export GPUNUM=1
export PLACEMENT='const'
export USE_SHARD_INIT=False
export BATCH_SIZE=32
# export MODEL_TYPE="gpt2_10b"
export TPDEGREE=${TPDEGREE:-1}
export GPUNUM=${GPUNUM:-1}
export PLACEMENT=${PLACEMENT:-'const'}
export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
export BATCH_SIZE=${BATCH_SIZE:-8}
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
mkdir -p logs
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
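Because every option now falls back to a ${VAR:-default} value, run.sh can also be launched directly with only the knobs that differ from the defaults. For example, a torch_ddp baseline on 2 GPUs (an illustrative invocation, not part of this commit) could be started as:

# override only the distributed plan and GPU count; everything else uses the defaults above
DISTPAN="torch_ddp" GPUNUM=2 bash ./run.sh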

train_gpt_demo.py

@@ -5,7 +5,6 @@ from time import time
import psutil
import torch
import torch.nn as nn
from model_zoo import model_builder
from packaging import version
from torch.nn.parallel import DistributedDataParallel as DDP
@@ -17,6 +16,7 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, Proces
from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.zero.sharded_optim import LowLevelZeroOptimizer
from model_zoo import model_builder
def parse_args():
@@ -55,7 +55,7 @@ def parse_args():
parser.add_argument(
"--model_type",
type=str,
default='gpt2_medium',
default="gpt2_medium",
help="model model scale",
)
args = parser.parse_args()
@@ -309,6 +309,8 @@ def main():
if n >= WARMUP_STEPS:
tflops_list.append(step_tflops)
logger.info(f"max memory {torch.cuda.memory_allocated() / 1024**2} MB", ranks=[0])
tflops_list.sort()
median_index = (NUM_STEPS - WARMUP_STEPS) >> 1    # tflops_list holds only the post-warmup samples
logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")