ColossalAI/examples/images/vit/run_demo.sh

# Demo launcher: installs the Python requirements listed in requirements.txt,
# then starts vit_train_demo.py through torchrun using the settings below.
# requirements.txt and vit_train_demo.py are referenced by relative path, so
# run this script from the directory that contains them.
# Edit the variables in this file to change the run.

# -x traces every command; -e aborts on the first failing command, so a failed
# `pip install` prevents the training launch.
set -xe
pip install -r requirements.txt

# model name or path
MODEL="google/vit-base-patch16-224"

# path for saving model
OUTPUT_PATH="./output_model"

# plugin (training strategy)
# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel"
PLUGIN="gemini"
#PLUGIN="hybrid_parallel"

# configuration of parallel group sizes, only used when setting PLUGIN to "hybrid_parallel"
TP_SIZE=2
PP_SIZE=2

# number of gpus to use
GPUNUM=4

# batch size per data parallel group
BS=16

# learning rate
LR="2e-4"

# number of epochs
EPOCH=3

# weight decay
WEIGHT_DECAY=0.05

# ratio of warmup steps
WARMUP_RATIO=0.3

# run the script for demo
# Every expansion is double-quoted so that an edited value containing spaces
# or glob characters (e.g. a local model/output path) is passed to torchrun
# as exactly one argument.
torchrun \
  --standalone \
  --nproc_per_node "${GPUNUM}" \
  vit_train_demo.py \
  --model_name_or_path "${MODEL}" \
  --output_path "${OUTPUT_PATH}" \
  --plugin "${PLUGIN}" \
  --batch_size "${BS}" \
  --tp_size "${TP_SIZE}" \
  --pp_size "${PP_SIZE}" \
  --num_epoch "${EPOCH}" \
  --learning_rate "${LR}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --warmup_ratio "${WARMUP_RATIO}"
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00			`set -xe`
			`pip install -r requirements.txt`

			`# model name or path`
			`MODEL="google/vit-base-patch16-224"`

			`# path for saving model`
[example] update vit example for hybrid parallel plugin (#4641) * update vit example for hybrid plugin * reset tp/pp size * fix dataloader iteration bug * update optimizer passing in evaluation/add grad_accum * change criterion * wrap tqdm * change grad_accum to grad_checkpoint * fix pbar 2023-09-07 09:38:45 +00:00			`OUTPUT_PATH="./output_model"`
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00
			`# plugin(training strategy)`
[example] update vit example for hybrid parallel plugin (#4641) * update vit example for hybrid plugin * reset tp/pp size * fix dataloader iteration bug * update optimizer passing in evaluation/add grad_accum * change criterion * wrap tqdm * change grad_accum to grad_checkpoint * fix pbar 2023-09-07 09:38:45 +00:00			`# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel"`
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00			`PLUGIN="gemini"`
[example] update vit example for hybrid parallel plugin (#4641) * update vit example for hybrid plugin * reset tp/pp size * fix dataloader iteration bug * update optimizer passing in evaluation/add grad_accum * change criterion * wrap tqdm * change grad_accum to grad_checkpoint * fix pbar 2023-09-07 09:38:45 +00:00			`#PLUGIN="hybrid_parallel"`

			`# configuration of parallel group sizes, only used when setting PLUGIN to "hybrid_parallel"`
			`TP_SIZE=2`
			`PP_SIZE=2`
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00
			`# number of gpus to use`
			`GPUNUM=4`

[example] update vit example for hybrid parallel plugin (#4641) * update vit example for hybrid plugin * reset tp/pp size * fix dataloader iteration bug * update optimizer passing in evaluation/add grad_accum * change criterion * wrap tqdm * change grad_accum to grad_checkpoint * fix pbar 2023-09-07 09:38:45 +00:00			`# batch size per data parallel group`
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00			`BS=16`

			`# learning rate`
			`LR="2e-4"`

			`# number of epoch`
			`EPOCH=3`

			`# weight decay`
			`WEIGHT_DECAY=0.05`

			`# ratio of warmup steps`
			`WARMUP_RATIO=0.3`

			`# run the script for demo`
			`torchrun \`
			`--standalone \`
			`--nproc_per_node ${GPUNUM} \`
			`vit_train_demo.py \`
			`--model_name_or_path ${MODEL} \`
			`--output_path ${OUTPUT_PATH} \`
			`--plugin ${PLUGIN} \`
			`--batch_size ${BS} \`
[example] update vit example for hybrid parallel plugin (#4641) * update vit example for hybrid plugin * reset tp/pp size * fix dataloader iteration bug * update optimizer passing in evaluation/add grad_accum * change criterion * wrap tqdm * change grad_accum to grad_checkpoint * fix pbar 2023-09-07 09:38:45 +00:00			`--tp_size ${TP_SIZE} \`
			`--pp_size ${PP_SIZE} \`
[example] update ViT example using booster api (#3940) 2023-06-12 07:02:27 +00:00			`--num_epoch ${EPOCH} \`
			`--learning_rate ${LR} \`
			`--weight_decay ${WEIGHT_DECAY} \`
			`--warmup_ratio ${WARMUP_RATIO}`