# Patch overview (CI entry points for the examples/language demos):
#   * gpt/gemini/run_gemini.sh      - exports TRAIN_STEP (default 10) and forwards it as --train_step.
#   * gpt/gemini/test_ci.sh         - new; sweeps DISTPLAN / BATCH_SIZE / GPUNUM / TPDEGREE / PLACEMENT and calls run_gemini.sh with TRAIN_STEP=4.
#   * gpt/gemini/train_gpt_demo.py  - replaces the --steps option with --train_step (default 10).
#   * gpt/test_ci.sh                - now only changes into gemini/ and runs its test_ci.sh.
#   * opt/test_ci.sh                - new; runs run_gemini.sh for GPUNUM 2 and 1.
#   * palm/run.sh, palm/test_ci.sh  - launch train.py; the CI script passes --dummy_data=True.
#   * palm/train.py                 - NUM_BATCHES 100 -> 10, new --dummy_data option, dataset loading moved into generate_dataset().
# Review fixes folded into the added ('+') lines; no hunk line count changes:
#   1. test_ci.sh (gemini and palm): `$(cd `dirname $0`;pwd)` executed the printed directory path as a command and
#      changed directory only inside the substitution subshell; it is now `cd "$(dirname "$0")"`.
#   2. palm/train.py --dummy_data: argparse `type=bool` turns every non-empty string (including "False") into True;
#      the value is now parsed explicitly ("1", "true", "t", "yes", "y" -> True, anything else -> False).
#   3. palm/train.py generate_dataset: np.fromstring on binary data is deprecated; np.frombuffer(...).copy()
#      produces the same writable uint8 array.
# NOTE(review): pipelines ending in `| tee` return tee's exit status, so a failing torchrun may not fail these
#   CI scripts unless pipefail is enabled elsewhere - confirm.
# NOTE(review): generate_dataset(args.dummy_data) runs at module level and presumes `args` is defined earlier in
#   train.py - not visible in this patch, verify.
diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index 0c2ea660f..6f0710d54 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"} export USE_SHARD_INIT=${USE_SHARD_INIT:-False} export BATCH_SIZE=${BATCH_SIZE:-16} export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} - +export TRAIN_STEP=${TRAIN_STEP:-10} # export PYTHONPATH=$PWD:$PYTHONPATH mkdir -p gemini_logs @@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \ --placement=${PLACEMENT} \ --shardinit=${USE_SHARD_INIT} \ --distplan=${DISTPLAN} \ +--train_step=${TRAIN_STEP} \ 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh new file mode 100644 index 000000000..6079d5ed6 --- /dev/null +++ b/examples/language/gpt/gemini/test_ci.sh @@ -0,0 +1,35 @@ +set -x +cd "$(dirname "$0")" +export TRAIN_STEP=4 + +for MODEL_TYPE in "gpt2_medium"; do + for DISTPLAN in "colossalai"; do + for BATCH_SIZE in 2; do + for GPUNUM in 1 4; do + for TPDEGREE in 1 2; do + if [ ${TPDEGREE} -gt ${GPUNUM} ]; then + continue + fi + for PLACEMENT in "cpu" "auto"; do + MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \ + bash ./run_gemini.sh + done + done + done + done + done + + for DISTPLAN in "zero1" "zero2"; do + for BATCH_SIZE in 2; do + for GPUNUM in 1 4; do + for TPDEGREE in 1; do + if [ ${TPDEGREE} -gt ${GPUNUM} ]; then + continue + fi + MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE}\ + bash ./run_gemini.sh + done + done + done + done +done diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 
f77be12d2..713de6f9f 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -65,7 +65,13 @@ def parse_args(): default="gpt2_medium", help="model model scale", ) - parser.add_argument("--steps", type=int, default=10, help="num of training steps") + parser.add_argument( + "--train_step", + type=int, + default=10, + help="training iterations for test", + ) + args = parser.parse_args() return args @@ -237,7 +243,8 @@ def main(): SEQ_LEN = 1024 VOCAB_SIZE = 50257 - NUM_STEPS = args.steps + NUM_STEPS = args.train_step + WARMUP_STEPS = 1 assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median " diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index d04ece182..d67c17229 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,15 +1,2 @@ -pip install -r requirements.txt - -# test colossalai -for TP in 1 2; do - for PLACEMENT in "cpu" "cuda" "auto" "const"; do - for SHARD in "True" "False"; do - colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1 - done - done -done - -# test zero1&2 -for DIST in "zero1" "zero2"; do - colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1 -done +set -x +cd gemini && bash test_ci.sh diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh new file mode 100644 index 000000000..317f602cd --- /dev/null +++ b/examples/language/opt/test_ci.sh @@ -0,0 +1,4 @@ +for GPUNUM in 2 1 +do +env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh +done diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh index 4aa868953..7a533509e 100644 --- a/examples/language/palm/run.sh +++ 
b/examples/language/palm/run.sh @@ -8,4 +8,4 @@ export PLACEMENT='cpu' export USE_SHARD_INIT=False export BATCH_SIZE=4 -env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log \ No newline at end of file +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh new file mode 100644 index 000000000..f21095578 --- /dev/null +++ b/examples/language/palm/test_ci.sh @@ -0,0 +1,9 @@ +cd "$(dirname "$0")" + +for BATCH_SIZE in 2 +do +for GPUNUM in 1 4 +do +env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log +done +done diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py index b17496954..2f012780d 100644 --- a/examples/language/palm/train.py +++ b/examples/language/palm/train.py @@ -23,7 +23,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext # constants -NUM_BATCHES = int(100) +NUM_BATCHES = int(10) WARMUP_BATCHES = 1 GRADIENT_ACCUMULATE_EVERY = 1 LEARNING_RATE = 2e-4 @@ -66,6 +66,12 @@ def parse_args(): default=8, help="batch size per DP group of training.", ) + parser.add_argument( + "--dummy_data", + type=lambda v: str(v).strip().lower() in ("1", "true", "t", "yes", "y"), + default=False, + help="use dummy dataset.", + ) args = parser.parse_args() return args @@ -171,10 +177,23 @@ disable_existing_loggers() colossalai.launch_from_torch(config={}) logger = get_dist_logger() -with gzip.open("./data/enwik8.gz") as file: - X = np.fromstring(file.read(int(95e6)), dtype=np.uint8) - trX, vaX = np.split(X, 
[int(90e6)]) - data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX) + +def generate_dataset(dummy_data: bool = False): + if not dummy_data: + with gzip.open("./data/enwik8.gz") as file: + X = np.frombuffer(file.read(int(95e6)), dtype=np.uint8).copy() + trX, vaX = np.split(X, [int(90e6)]) + data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX) + # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}") + # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}") + return data_train, data_val + else: + return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,)) + + +data_train, data_val = generate_dataset(args.dummy_data) + +print("generate dataset ready!") class TextSamplerDataset(Dataset):