torchrun --standalone --nproc_per_node=4 shardformer_benchmark.py \
    --model "bert" \
    --pretrain "bert-base-uncased" \
    --max_epochs 1 \
    --batch_size 2 \
    --lr 2.4e-5 \
    --fused_layernorm False \
    --accumulation_steps 8 \
    --warmup_fraction 0.03
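
# A hedged multi-node sketch (not from the original doc): the same benchmark
# launched across 2 nodes with 4 GPUs each, using torchrun's standard static
# rendezvous flags instead of --standalone. NODE_RANK (0 on the first node,
# 1 on the second) and MASTER_ADDR are placeholders you must set per node;
# the shardformer_benchmark.py arguments are unchanged from the command above.
torchrun --nnodes=2 --nproc_per_node=4 \
    --node_rank=${NODE_RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=29500 \
    shardformer_benchmark.py \
    --model "bert" \
    --pretrain "bert-base-uncased" \
    --max_epochs 1 \
    --batch_size 2 \
    --lr 2.4e-5 \
    --fused_layernorm False \
    --accumulation_steps 8 \
    --warmup_fraction 0.03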