#!/usr/bin/env bash
# Demo launcher: installs the Python requirements, then starts
# vit_train_demo.py through torchrun with the settings defined below.

# -x: echo each command before it runs; -e: stop at the first failing command.
set -xe

# Install the dependencies listed in requirements.txt (resolved relative to
# the current working directory).
pip install -r requirements.txt

# Model name or path, forwarded as --model_name_or_path.
MODEL="google/vit-base-patch16-224"

# Path for saving the model, forwarded as --output_path.
OUTPUT_PATH="./output_model"

# Plugin (training strategy).
# Can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel".
PLUGIN="gemini"
#PLUGIN="hybrid_parallel"

# Parallel group sizes; only used when PLUGIN is set to "hybrid_parallel".
TP_SIZE=2
PP_SIZE=2

# Number of GPUs to use (passed to torchrun as --nproc_per_node).
GPUNUM=4

# Batch size per data parallel group.
BS=16

# Learning rate.
LR="2e-4"

# Number of epochs.
EPOCH=3

# Weight decay.
WEIGHT_DECAY=0.05

# Ratio of warmup steps.
WARMUP_RATIO=0.3

# Run the demo script: torchrun starts GPUNUM worker processes of
# vit_train_demo.py and forwards the settings configured above.
# Every expansion is double-quoted so that a value containing whitespace or
# glob characters (e.g. an OUTPUT_PATH with spaces) stays a single argument.
# --tp_size and --pp_size are always passed, whatever PLUGIN is set to.
torchrun \
  --standalone \
  --nproc_per_node "${GPUNUM}" \
  vit_train_demo.py \
  --model_name_or_path "${MODEL}" \
  --output_path "${OUTPUT_PATH}" \
  --plugin "${PLUGIN}" \
  --batch_size "${BS}" \
  --tp_size "${TP_SIZE}" \
  --pp_size "${PP_SIZE}" \
  --num_epoch "${EPOCH}" \
  --learning_rate "${LR}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --warmup_ratio "${WARMUP_RATIO}"