mirror of https://github.com/hpcaitech/ColossalAI
ai, big-model, data-parallelism, deep-learning, distributed-computing, foundation-models, heterogeneous-training, hpc, inference, large-scale, model-parallelism, pipeline-parallelism
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
51 lines
1.0 KiB
# Launcher for the ColossalAI ViT fine-tuning demo: installs dependencies,
# configures the training hyperparameters, and starts a multi-GPU run.
set -xe

pip install -r requirements.txt

# Model name (Hugging Face hub id) or local path.
MODEL="google/vit-base-patch16-224"

# Path for saving the fine-tuned model.
OUTPUT_PATH="./output_model"

# Plugin (training strategy).
# Can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel".
PLUGIN="gemini"
#PLUGIN="hybrid_parallel"

# Parallel group sizes; only used when PLUGIN is set to "hybrid_parallel".
TP_SIZE=2
PP_SIZE=2

# Number of GPUs to use.
GPUNUM=4

# Batch size per data parallel group.
BS=16

# Learning rate.
LR="2e-4"

# Number of epochs.
EPOCH=3

# Weight decay.
WEIGHT_DECAY=0.05

# Ratio of warmup steps.
WARMUP_RATIO=0.3

# Run the demo. Expansions are quoted so values survive word-splitting
# (harmless today, but safe if e.g. MODEL becomes a path with spaces).
colossalai run \
  --nproc_per_node "${GPUNUM}" \
  --master_port 29505 \
  vit_train_demo.py \
  --model_name_or_path "${MODEL}" \
  --output_path "${OUTPUT_PATH}" \
  --plugin "${PLUGIN}" \
  --batch_size "${BS}" \
  --tp_size "${TP_SIZE}" \
  --pp_size "${PP_SIZE}" \
  --num_epoch "${EPOCH}" \
  --learning_rate "${LR}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --warmup_ratio "${WARMUP_RATIO}"