2023-01-06 02:08:41 +00:00
|
|
|
# Echo every command (after expansion) to stderr before it is executed.
set -o xtrace
|
|
|
|
# Batch size forwarded to the trainer via --batch_size; 16 unless the caller
# already provides a non-empty BS.
: "${BS:=16}"
export BS
|
|
|
|
# Value forwarded to the trainer via --mem_cap; falls back to 0 when MEMCAP is
# unset or empty.
: "${MEMCAP:=0}"
export MEMCAP
|
2023-02-22 02:59:48 +00:00
|
|
|
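# Model size. Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`.
# For `175b`, the weights are not hosted under `facebook/opt-175b`; they have to be requested from Meta AI separately (TODO confirm the current procedure).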
|
2023-01-06 02:08:41 +00:00
|
|
|
# OPT size suffix; it is appended to "facebook/opt-" to build the model path
# and also appears in the log file name. Defaults to 125m.
: "${MODEL:=125m}"
export MODEL
|
|
|
|
# Number of processes torchrun starts on this node (--nproc_per_node);
# defaults to 1 when GPUNUM is unset or empty.
: "${GPUNUM:=1}"
export GPUNUM
|
2023-03-08 05:45:15 +00:00
|
|
|
# Switch for the trainer's --shardinit flag: the literal string "true" turns it
# on, any other value leaves it off. Defaults to "false".
: "${USE_SHARD_INIT:=false}"
export USE_SHARD_INIT
|
2023-01-06 02:08:41 +00:00
|
|
|
|
|
|
|
# make directory for logs
|
|
|
|
# The launch command tees its output into ./logs, so the directory has to exist
# beforehand; nothing happens when it is already there.
[ -d ./logs ] || mkdir -p ./logs
|
|
|
|
|
2023-03-08 05:45:15 +00:00
|
|
|
# Turn the USE_SHARD_INIT switch into the literal command-line flag given to
# the trainer: "true" becomes --shardinit, every other value becomes the empty
# string (no flag at all).
# The expansion is quoted so that an empty or multi-word value is compared as a
# plain string instead of making `[` abort with a syntax error.
if [ "${USE_SHARD_INIT}" = "true" ]; then
  USE_SHARD_INIT="--shardinit"
else
  USE_SHARD_INIT=""
fi
|
|
|
|
|
2023-01-06 02:08:41 +00:00
|
|
|
# Model identifier given to --model_name_or_path: "facebook/opt-" followed by
# the MODEL size.
# NOTE(review): the variable is spelled MODLE_PATH (sic). It is exported and
# read again by the launch command, so the spelling is kept unchanged here.
MODLE_PATH="facebook/opt-${MODEL}"
export MODLE_PATH
|
|
|
|
|
|
|
|
# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
|
|
|
|
# Launch train_gemini_opt.py through torchrun and mirror everything it prints
# (stdout and stderr) into a log file named after the active configuration.
#
# Reads: GPUNUM, MEMCAP, MODLE_PATH, USE_SHARD_INIT, BS, MODEL.
# Writes: ./logs/colo_<MODEL>_bs_<BS>_cap_<MEMCAP>_gpu_<GPUNUM>.log
#
# Without pipefail the pipeline's status is that of `tee`, so a crashed
# training run would still make this script exit 0. Enable it where the shell
# supports it; the subshell probe keeps shells without the option from aborting.
(set -o pipefail) 2>/dev/null && set -o pipefail

# USE_SHARD_INIT is either "--shardinit" or empty. The ${var:+"..."} form
# contributes exactly one argument when it is set and none when it is empty,
# which keeps the optional flag safe while every other expansion is quoted.
torchrun \
  --nproc_per_node "${GPUNUM}" \
  --master_port 19198 \
  train_gemini_opt.py \
  --mem_cap "${MEMCAP}" \
  --model_name_or_path "${MODLE_PATH}" \
  ${USE_SHARD_INIT:+"${USE_SHARD_INIT}"} \
  --batch_size "${BS}" 2>&1 \
  | tee "./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log"
|