mirror of https://github.com/hpcaitech/ColossalAI
[hotfix] fix gpt gemini example (#2404)
* [hotfix] fix gpt gemini example
* [example] add new assertions

Branch: pull/2405/head
Parent: 9880fd2cd8
Commit: 498b5ca993
@@ -1,4 +1,5 @@
 for MODEL_TYPE in "gpt2_medium"; do
+for DISPAN in "colossalai"; do
 for BATCH_SIZE in 16; do
 for GPUNUM in 1 2 4 8; do
 for TPDEGREE in 1 2 4 8; do
@@ -7,9 +8,9 @@ for MODEL_TYPE in "gpt2_medium"; do
 fi
 for PLACEMENT in "cpu" "auto"; do
 echo "****************** Begin ***************************"
-echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
-MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
-bash ./gemini/run_gemini.sh
+echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+bash ./run_gemini.sh
 echo "****************** Finished ***************************"
 echo ""
 echo ""
@@ -17,4 +18,5 @@ for MODEL_TYPE in "gpt2_medium"; do
 done
 done
 done
+done
 done
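For readers reproducing the benchmark, here is a minimal sketch of a launcher that consumes the environment variables the loop above exports. It is an assumption for illustration, not the committed run_gemini.sh: the script name ./train_gpt_demo.py, the torchrun invocation, and the --batch_size and --placement flags are guesses, while --distplan, --tp_degree, and --model_type match the argparse attributes visible in the Python diff below.

#!/usr/bin/env bash
# Hypothetical sketch of run_gemini.sh, NOT the committed script.
set -eu

MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}   # model preset to benchmark
DISPAN=${DISPAN:-"colossalai"}            # distributed plan (e.g. colossalai)
BATCH_SIZE=${BATCH_SIZE:-16}              # per-run batch size
GPUNUM=${GPUNUM:-1}                       # number of GPUs to launch on
TPDEGREE=${TPDEGREE:-1}                   # tensor-parallel degree
PLACEMENT=${PLACEMENT:-"cpu"}             # Gemini placement policy (cpu / auto)

# Flag names are assumptions; torchrun is one common launcher choice.
torchrun --standalone --nproc_per_node="${GPUNUM}" ./train_gpt_demo.py \
    --model_type="${MODEL_TYPE}" \
    --distplan="${DISPAN}" \
    --batch_size="${BATCH_SIZE}" \
    --tp_degree="${TPDEGREE}" \
    --placement="${PLACEMENT}"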
@@ -270,6 +270,7 @@ def main():
 
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
+        # You should notice that v0.1.10 is not compatible with TP degree > 1
         tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
@@ -278,6 +279,7 @@ def main():
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
+        assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
         model = model_builder(args.model_type)(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):
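The new assertion tightens the non-Gemini path: when the distributed plan is not colossalai, tensor parallelism must be disabled before the plain .cuda() model is built. Assuming the launcher forwards DISPAN to --distplan as sketched above (the plan name torch_ddp is an assumption), the behaviour would look like:

# Passes: a torch DDP plan with tensor parallelism disabled.
DISPAN="torch_ddp" TPDEGREE=1 GPUNUM=4 BATCH_SIZE=16 bash ./run_gemini.sh

# Trips the new assert: "The degree of TP should be 1 for DDP examples."
DISPAN="torch_ddp" TPDEGREE=2 GPUNUM=4 BATCH_SIZE=16 bash ./run_gemini.sh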