mirror of https://github.com/hpcaitech/ColossalAI
[example] update gemini benchmark bash (#2306)
parent
9b765e7a69
commit
e00cedd181
|
@ -1,22 +1,20 @@
|
||||||
for MODEL_TYPE in "gpt2_medium"
|
for MODEL_TYPE in "gpt2_medium"; do
|
||||||
do
|
for BATCH_SIZE in 16; do
|
||||||
for BATCH_SIZE in 16
|
for GPUNUM in 1 2 4 8; do
|
||||||
do
|
for TPDEGREE in 1 2 4 8; do
|
||||||
for GPUNUM in 1 2 4 8
|
if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
|
||||||
do
|
|
||||||
for TPDEGREE in 1 2 4 8
|
|
||||||
do
|
|
||||||
if [ ${TPDEGREE} -gt ${GPUNUM} ]
|
|
||||||
then
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
echo "****************** Begin ***************************"
|
for PLACEMENT in "cpu" "auto"; do
|
||||||
echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
|
echo "****************** Begin ***************************"
|
||||||
MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} bash ./run_gemini.sh
|
echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
|
||||||
echo "****************** Finished ***************************"
|
MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
|
||||||
echo ""
|
bash ./run_gemini.sh
|
||||||
echo ""
|
echo "****************** Finished ***************************"
|
||||||
done
|
echo ""
|
||||||
done
|
echo ""
|
||||||
done
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
done
|
done
|
||||||
|
|
|
@ -10,4 +10,11 @@ export BATCH_SIZE=${BATCH_SIZE:-16}
|
||||||
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
|
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
|
||||||
|
|
||||||
mkdir -p gemini_logs
|
mkdir -p gemini_logs
|
||||||
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
|
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py \
|
||||||
|
--tp_degree=${TPDEGREE} \
|
||||||
|
--model_type=${MODEL_TYPE} \
|
||||||
|
--batch_size=${BATCH_SIZE} \
|
||||||
|
--placement=${PLACEMENT} \
|
||||||
|
--shardinit=${USE_SHARD_INIT} \
|
||||||
|
--distplan=${DISTPAN} \
|
||||||
|
2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
|
||||||
|
|
|
@ -217,8 +217,7 @@ def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# version check
|
# version check
|
||||||
# this example is supposed to work for versions 0.1.9 and later, but earlier than 0.2.0
|
# this example is supposed to work for version 0.1.9 and later
|
||||||
assert version.parse(CAI_VERSION) < version.parse("0.2.0")
|
|
||||||
assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
|
assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
|
||||||
|
|
||||||
set_cpu_maximum_parallelism()
|
set_cpu_maximum_parallelism()
|
||||||
|
|
Loading…
Reference in New Issue