[example] update gemini benchmark bash (#2306)

pull/2312/head
HELSON 2023-01-04 11:59:26 +08:00 committed by GitHub
parent 9b765e7a69
commit e00cedd181
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 28 additions and 24 deletions

View File

@ -1,22 +1,20 @@
for MODEL_TYPE in "gpt2_medium"
do
for BATCH_SIZE in 16
do
for GPUNUM in 1 2 4 8
do
for TPDEGREE in 1 2 4 8
do
if [ ${TPDEGREE} -gt ${GPUNUM} ]
then
continue
fi
echo "****************** Begin ***************************"
echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} bash ./run_gemini.sh
echo "****************** Finished ***************************"
echo ""
echo ""
done
done
done
# Sweep the Gemini benchmark over a grid of configurations.
# Every (MODEL_TYPE, BATCH_SIZE, GPUNUM, TPDEGREE, PLACEMENT) combination is
# handed to one ./run_gemini.sh invocation through per-command environment
# variables. A failing run does not stop the sweep.
for MODEL_TYPE in "gpt2_medium"; do
  for BATCH_SIZE in 16; do
    for GPUNUM in 1 2 4 8; do
      for TPDEGREE in 1 2 4 8; do
        # Skip combinations where TPDEGREE is larger than GPUNUM.
        if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
          continue
        fi
        for PLACEMENT in "cpu" "auto"; do
          echo "****************** Begin ***************************"
          # The banner reports the batch size once, from BATCH_SIZE; the
          # previous extra "BS ${BS}" field read a variable this script never sets.
          echo "* benchmarking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
          MODEL_TYPE="${MODEL_TYPE}" \
            BATCH_SIZE="${BATCH_SIZE}" \
            GPUNUM="${GPUNUM}" \
            TPDEGREE="${TPDEGREE}" \
            PLACEMENT="${PLACEMENT}" \
            bash ./run_gemini.sh
          echo "****************** Finished ***************************"
          echo ""
          echo ""
        done
      done
    done
  done
done

View File

@ -10,4 +10,11 @@ export BATCH_SIZE=${BATCH_SIZE:-16}
# Fall back to the "gpt2_medium" model when MODEL_TYPE is unset or empty.
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
# Ensure the log directory that the tee below writes into exists
# (-p: no error if it is already there).
mkdir -p gemini_logs
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
# Launch train_gpt_demo.py through torchrun in standalone mode, starting
# GPUNUM processes on this node, and mirror the combined stdout/stderr into a
# log file named after the run's configuration.
# Every expansion is quoted so that an empty or space-containing value stays a
# single argument instead of being word-split or dropped.
# NOTE(review): USE_SHARD_INIT and DISTPAN (spelled as in the rest of this
# script) are not set in the visible part of the file — presumably defined
# above; confirm.
torchrun --standalone --nproc_per_node="${GPUNUM}" train_gpt_demo.py \
  --tp_degree="${TPDEGREE}" \
  --model_type="${MODEL_TYPE}" \
  --batch_size="${BATCH_SIZE}" \
  --placement="${PLACEMENT}" \
  --shardinit="${USE_SHARD_INIT}" \
  --distplan="${DISTPAN}" \
  2>&1 | tee "./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log"

View File

@ -217,8 +217,7 @@ def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str
def main():
# version check
# this example is supposed to work for versions less than 0.2.0 but greater than 0.1.9
assert version.parse(CAI_VERSION) < version.parse("0.2.0")
# this example is supposed to work for versions greater than or equal to 0.1.9
assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
set_cpu_maximum_parallelism()