pull/2484/head
jiaruifang 2 years ago
commit 236b4195ff

@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
export USE_SHARD_INIT=${USE_SHARD_INIT:-False} export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
export BATCH_SIZE=${BATCH_SIZE:-16} export BATCH_SIZE=${BATCH_SIZE:-16}
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
export TRAIN_STEP=${TRAIN_STEP:-10}
# export PYTHONPATH=$PWD:$PYTHONPATH # export PYTHONPATH=$PWD:$PYTHONPATH
mkdir -p gemini_logs mkdir -p gemini_logs
@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
--placement=${PLACEMENT} \ --placement=${PLACEMENT} \
--shardinit=${USE_SHARD_INIT} \ --shardinit=${USE_SHARD_INIT} \
--distplan=${DISTPLAN} \ --distplan=${DISTPLAN} \
--train_step=${TRAIN_STEP} \
2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log

@ -0,0 +1,35 @@
# CI smoke test: sweeps a small grid of run configurations and launches
# ./run_gemini.sh once per combination. Each run's settings are handed to the
# launched script through environment variables on the command line.
set -x

# Work from the directory that contains this script so ./run_gemini.sh
# resolves regardless of the caller's working directory. The earlier
# `$(cd ...;pwd)` form only changed directory inside a subshell and then tried
# to execute the printed path as a command.
cd "$(dirname "$0")" || exit 1

# Exported so every launched run inherits the same (short) step count.
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  # Pass 1: "colossalai" plan, swept over GPU count, TP degree and placement.
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          # Skip combinations whose TP degree exceeds the number of GPUs.
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            MODEL_TYPE="${MODEL_TYPE}" DISTPLAN="${DISTPLAN}" \
              BATCH_SIZE="${BATCH_SIZE}" GPUNUM="${GPUNUM}" \
              TPDEGREE="${TPDEGREE}" PLACEMENT="${PLACEMENT}" \
              bash ./run_gemini.sh || exit 1
          done
        done
      done
    done
  done

  # Pass 2: "zero1" / "zero2" plans with TP degree 1 only. PLACEMENT is not
  # passed here, so the launched script's own handling of it applies.
  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          # Note the space before each line-continuation backslash: without
          # it the last assignment and the command word fuse into one token.
          MODEL_TYPE="${MODEL_TYPE}" DISTPLAN="${DISTPLAN}" \
            BATCH_SIZE="${BATCH_SIZE}" GPUNUM="${GPUNUM}" \
            TPDEGREE="${TPDEGREE}" \
            bash ./run_gemini.sh || exit 1
        done
      done
    done
  done
done

@ -65,7 +65,13 @@ def parse_args():
default="gpt2_medium", default="gpt2_medium",
help="model model scale", help="model model scale",
) )
parser.add_argument("--steps", type=int, default=10, help="num of training steps") parser.add_argument(
"--train_step",
type=int,
default=10,
help="training iterations for test",
)
args = parser.parse_args() args = parser.parse_args()
return args return args
@ -237,7 +243,8 @@ def main():
SEQ_LEN = 1024 SEQ_LEN = 1024
VOCAB_SIZE = 50257 VOCAB_SIZE = 50257
NUM_STEPS = args.steps NUM_STEPS = args.train_step
WARMUP_STEPS = 1 WARMUP_STEPS = 1
assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median " assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "

@ -1,15 +1,2 @@
pip install -r requirements.txt set -x
cd gemini && bash test_ci.sh
# test colossalai
for TP in 1 2; do
for PLACEMENT in "cpu" "cuda" "auto" "const"; do
for SHARD in "True" "False"; do
colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
done
done
done
# test zero1&2
for DIST in "zero1" "zero2"; do
colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
done

@ -0,0 +1,4 @@
# Launch ./run_gemini.sh once per GPU count (2 first, then 1), passing the
# batch size, model name and GPU count to it through its environment.
for GPUNUM in 2 1; do
  BS=2 MODEL="125m" GPUNUM="${GPUNUM}" bash ./run_gemini.sh
done

@ -8,4 +8,4 @@ export PLACEMENT='cpu'
export USE_SHARD_INIT=False export USE_SHARD_INIT=False
export BATCH_SIZE=4 export BATCH_SIZE=4
env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log

@ -0,0 +1,9 @@
# Runs train.py on dummy data once per (batch size, GPU count) combination.

# Work from the directory that contains this script so train.py and run.log
# resolve relative to it. The earlier `$(cd ...;pwd)` form only changed
# directory inside a subshell and then tried to execute the printed path as a
# command.
cd "$(dirname "$0")" || exit 1

for BATCH_SIZE in 2; do
  for GPUNUM in 1 4; do
    # stderr is merged into stdout so both reach the terminal and run.log.
    # NOTE: tee is used without -a, so each iteration overwrites run.log.
    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node="${GPUNUM}" --master_port 29501 train.py --dummy_data=True --batch_size="${BATCH_SIZE}" 2>&1 | tee run.log
  done
done

@ -23,7 +23,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
# constants # constants
NUM_BATCHES = int(100) NUM_BATCHES = int(10)
WARMUP_BATCHES = 1 WARMUP_BATCHES = 1
GRADIENT_ACCUMULATE_EVERY = 1 GRADIENT_ACCUMULATE_EVERY = 1
LEARNING_RATE = 2e-4 LEARNING_RATE = 2e-4
@ -66,6 +66,12 @@ def parse_args():
default=8, default=8,
help="batch size per DP group of training.", help="batch size per DP group of training.",
) )
parser.add_argument(
"--dummy_data",
type=bool,
default=False,
help="use dummy dataset.",
)
args = parser.parse_args() args = parser.parse_args()
return args return args
@ -171,10 +177,23 @@ disable_existing_loggers()
colossalai.launch_from_torch(config={}) colossalai.launch_from_torch(config={})
logger = get_dist_logger() logger = get_dist_logger()
with gzip.open("./data/enwik8.gz") as file:
X = np.fromstring(file.read(int(95e6)), dtype=np.uint8) def generate_dataset(dummy_data: bool = False):
trX, vaX = np.split(X, [int(90e6)]) if not dummy_data:
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX) with gzip.open("./data/enwik8.gz") as file:
X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
trX, vaX = np.split(X, [int(90e6)])
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
# print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}")
# print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}")
return data_train, data_val
else:
return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
data_train, data_val = generate_dataset(args.dummy_data)
print("generate dataset ready!")
class TextSamplerDataset(Dataset): class TextSamplerDataset(Dataset):

Loading…
Cancel
Save