#!/usr/bin/env bash
#
# Launch convergence_benchmark.py for BERT (bert-base-uncased) on a single
# node through torchrun.
#
# Usage:
#   bash <this-script> [extra convergence_benchmark.py arguments...]
#
# Environment:
#   NPROC_PER_NODE  Number of worker processes torchrun starts on this node
#                   (default: 4).
#
# convergence_benchmark.py is resolved relative to the current working
# directory, so run this script from the directory that contains it.
set -euo pipefail

# Worker count stays at 4 unless the caller overrides it via the environment.
nproc_per_node="${NPROC_PER_NODE:-4}"

# Arguments are kept in an array so each flag can be commented and no
# trailing-backslash continuation can be broken by stray whitespace.
benchmark_args=(
  --model "bert"
  --pretrain "bert-base-uncased"
  --max_epochs 3
  --batch_size 2
  --lr 2.4e-5
  # NOTE(review): if the benchmark declares this option with argparse
  # `type=bool`, the non-empty string "False" is parsed as True. Confirm how
  # convergence_benchmark.py interprets the value.
  --fused_layernorm False
  --accumulation_steps 8
  --warmup_fraction 0.03
)

# Extra command-line arguments are appended after the defaults above.
# NOTE(review): whether a repeated flag overrides the earlier value depends on
# the benchmark's argument parser — verify before relying on it.
torchrun --standalone --nproc_per_node="${nproc_per_node}" \
  convergence_benchmark.py "${benchmark_args[@]}" "$@"