mirror of https://github.com/hpcaitech/ColossalAI
[hotfix] add DISTPAN argument for benchmark (#2412)
* change the benchmark config file
* change config
* revert config file
* rename distpan to distplan
parent 7d5640b9db
commit d84e747975

@@ -1,5 +1,5 @@
 for MODEL_TYPE in "gpt2_medium"; do
-for DISPAN in "colossalai"; do
+for DISTPLAN in "colossalai"; do
 for BATCH_SIZE in 16; do
 for GPUNUM in 1 2 4 8; do
 for TPDEGREE in 1 2 4 8; do

@@ -8,8 +8,8 @@ for MODEL_TYPE in "gpt2_medium"; do
 fi
 for PLACEMENT in "cpu" "auto"; do
 echo "****************** Begin ***************************"
-echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
-MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+echo "+ benchmrking MODEL ${MODEL_TYPE} DISTPLAN ${DISTPLAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
 bash ./run_gemini.sh
 echo "****************** Finished ***************************"
 echo ""

@@ -1,8 +1,8 @@
 set -x
 # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN=${DISTPAN:-"colossalai"}
+export DISTPLAN=${DISTPLAN:-"colossalai"}
 
-# The following options only valid when DISTPAN="colossalai"
+# The following options only valid when DISTPLAN="colossalai"
 export GPUNUM=${GPUNUM:-1}
 export TPDEGREE=${TPDEGREE:-1}
 export PLACEMENT=${PLACEMENT:-"cpu"}

@@ -20,5 +20,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --batch_size=${BATCH_SIZE} \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
---distplan=${DISTPAN} \
-2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
+--distplan=${DISTPLAN} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log

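For context, here is a minimal usage sketch of the renamed variable, written in the same environment-variable style the benchmark loop above uses to call run_gemini.sh. The DISTPLAN values are the ones listed in the script's own comment; the mkdir line is only a precautionary assumption, since the hunks above do not show whether run_gemini.sh creates gemini_logs itself.

# Sketch: run one configuration by hand with the renamed DISTPLAN variable.
# Accepted plans, per the comment in run_gemini.sh:
#   "colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"
# GPUNUM / TPDEGREE / PLACEMENT only take effect when DISTPLAN="colossalai".
mkdir -p ./gemini_logs    # assumption: the log directory may not exist yet

MODEL_TYPE="gpt2_medium" DISTPLAN="colossalai" BATCH_SIZE=16 \
GPUNUM=2 TPDEGREE=2 PLACEMENT="auto" \
    bash ./run_gemini.sh
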
@@ -290,9 +290,11 @@ def main():
        from torch.distributed.optim import ZeroRedundancyOptimizer
        optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
    elif args.distplan.startswith("zero"):
        model = model.half()
        partition_flag = args.distplan == "zero2"
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        optimizer = LowLevelZeroOptimizer(optimizer,
                                          reduce_bucket_size=12 * 1024 * 1024,
                                          overlap_communication=True,
                                          partition_grad=partition_flag,
                                          verbose=True)
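The ZeRO branch in this last hunk is reached through the same renamed variable: --distplan=${DISTPLAN} in run_gemini.sh becomes args.distplan here, and only the "zero2" value sets partition_flag. A hedged illustration of the two calls, reusing values from the script's comment:

# Illustrative only: drive the low-level ZeRO branch through DISTPLAN.
# "zero1" leaves partition_flag False; "zero2" makes it True, which enables
# gradient partitioning (partition_grad) in LowLevelZeroOptimizer.
DISTPLAN="zero1" BATCH_SIZE=16 bash ./run_gemini.sh
DISTPLAN="zero2" BATCH_SIZE=16 bash ./run_gemini.sh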