ColossalAI/examples/community/roberta/pretraining/run_pretrain_resume.sh

#!/usr/bin/env sh
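# Resume RoBERTa pretraining from a saved checkpoint using the ColossalAI launcher.
# Hostnames, ports, and data/checkpoint paths below are cluster-specific; adjust them
# to match your environment before running.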
root_path=$PWD
PY_FILE_PATH="$root_path/run_pretraining.py"
tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"
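# Create output directories for TensorBoard summaries, experiment logs, and checkpoints.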
mkdir -p "$tensorboard_path"
mkdir -p "$log_path"
mkdir -p "$ckpt_path"
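# Make the local pretraining modules importable by the launched worker processes.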
export PYTHONPATH="$PWD"
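# Launch pretraining on the 4 nodes listed in ./hostfile, with 8 processes (GPUs) per node.
# OMP_NUM_THREADS caps the CPU threads each worker may spawn. The --resume_train,
# --load_pretrain_model, and --load_optimizer_lr flags restore the model weights and the
# optimizer/LR-scheduler state saved by a previous run.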
env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--include GPU002,GPU003,GPU004,GPU007 \
--nproc_per_node=8 \
"$PY_FILE_PATH" \
--master_addr GPU007 \
--master_port 20024 \
--lr 2.0e-4 \
--train_micro_batch_size_per_gpu 190 \
--eval_micro_batch_size_per_gpu 20 \
--epoch 15 \
--data_path_prefix /h5 \
--eval_data_path_prefix /eval_h5 \
--tokenizer_path /roberta \
--bert_config /roberta/config.json \
--tensorboard_path "$tensorboard_path" \
--log_path "$log_path" \
--ckpt_path "$ckpt_path" \
--log_interval 50 \
--mlm bert \
--wandb \
--checkpoint_activations \
--resume_train \
--load_pretrain_model /ckpt/1.pt \
--load_optimizer_lr /ckpt/1.op_lrs