mirror of https://github.com/hpcaitech/ColossalAI
Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into dev0116
commit 236b4195ff
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
 export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
-
+export TRAIN_STEP=${TRAIN_STEP:-10}
 # export PYTHONPATH=$PWD:$PYTHONPATH

 mkdir -p gemini_logs

@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
     --placement=${PLACEMENT} \
     --shardinit=${USE_SHARD_INIT} \
     --distplan=${DISTPLAN} \
+    --train_step=${TRAIN_STEP} \
     2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log

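Note: every variable consumed by the launcher above has an environment default, so the new TRAIN_STEP knob can be overridden without editing the script. A minimal usage sketch (assuming it is invoked from the directory containing run_gemini.sh; values are illustrative):

    TRAIN_STEP=4 GPUNUM=1 DISTPLAN="colossalai" PLACEMENT="cpu" bash ./run_gemini.sh

Unset variables fall back to the defaults shown above: 10 training steps, batch size 16, and the gpt2_medium model.
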
@@ -0,0 +1,35 @@
+set -x
+$(cd `dirname $0`;pwd)
+export TRAIN_STEP=4
+
+for MODEL_TYPE in "gpt2_medium"; do
+  for DISTPLAN in "colossalai"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1 2; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            bash ./run_gemini.sh
+          done
+        done
+      done
+    done
+  done
+
+  for DISTPLAN in "zero1" "zero2"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
+          bash ./run_gemini.sh
+        done
+      done
+    done
+  done
+done

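For scale: the Gemini sweep above skips TPDEGREE > GPUNUM, leaving the (GPUNUM, TPDEGREE) pairs (1, 1), (4, 1) and (4, 2), each run with two placements, i.e. 6 launches; the ZeRO sweep adds 2 plans x 2 GPU counts = 4 more, for 10 launches in total at TRAIN_STEP=4 each. A dry-run sketch of the same guard, echoing instead of launching (illustration only):

    for GPUNUM in 1 4; do
      for TPDEGREE in 1 2; do
        # skip tensor-parallel degrees larger than the GPU count
        if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
          continue
        fi
        echo "would launch: GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE}"
      done
    done
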
@@ -65,7 +65,13 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
-    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="training iterations for test",
+    )
+
     args = parser.parse_args()
     return args

@@ -237,7 +243,8 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257

-    NUM_STEPS = args.steps
+    NUM_STEPS = args.train_step
+
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "

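The renamed flag feeds straight into the step-count checks above: with WARMUP_STEPS = 1, the default --train_step=10 leaves 9 measured steps and the CI setting TRAIN_STEP=4 leaves 3, both odd as the median-taking assertion requires. A minimal direct invocation sketch using only flags that appear in this diff (values are illustrative):

    torchrun --standalone --nproc_per_node=1 ./train_gpt_demo.py \
        --distplan="colossalai" \
        --tp_degree=1 \
        --placement="cpu" \
        --shardinit=False \
        --train_step=4
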
@@ -1,15 +1,2 @@
-pip install -r requirements.txt
-# test colossalai
-for TP in 1 2; do
-for PLACEMENT in "cpu" "cuda" "auto" "const"; do
-for SHARD in "True" "False"; do
-colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
-done
-done
-done
-
-# test zero1&2
-for DIST in "zero1" "zero2"; do
-colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
-done
-
+set -x
+cd gemini && bash test_ci.sh

@@ -0,0 +1,4 @@
+for GPUNUM in 2 1
+do
+env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+done

@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
 export BATCH_SIZE=4

-env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log

@@ -0,0 +1,9 @@
+$(cd `dirname $0`;pwd)
+
+for BATCH_SIZE in 2
+do
+for GPUNUM in 1 4
+do
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log
+done
+done

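A small caveat on the two new test scripts: the line $(cd `dirname $0`;pwd) expands to the script's absolute directory and then tries to execute that path as a command, which fails (typically with "Is a directory") and leaves the working directory unchanged. If the intent was to run from the script's own directory, a conventional sketch would be:

    cd "$(dirname "$0")"   # hedged guess at the intent: resolve relative paths from the script's location
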
@@ -23,7 +23,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext

 # constants

-NUM_BATCHES = int(100)
+NUM_BATCHES = int(10)
 WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4

@@ -66,6 +66,12 @@ def parse_args():
         default=8,
         help="batch size per DP group of training.",
     )
+    parser.add_argument(
+        "--dummy_data",
+        type=bool,
+        default=False,
+        help="use dummy dataset.",
+    )
     args = parser.parse_args()
     return args

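Worth noting: argparse's type=bool just applies bool() to the raw string, and any non-empty string, including "False", is truthy, so --dummy_data=False still enables dummy data; only omitting the flag keeps the default of False. The --dummy_data=True form used by the new test script behaves as intended. Illustrative invocations (single process, other options omitted):

    torchrun --standalone --nproc_per_node=1 train.py --dummy_data=True --batch_size=2    # dummy data
    torchrun --standalone --nproc_per_node=1 train.py --dummy_data=False --batch_size=2   # also dummy data: bool("False") is True
    torchrun --standalone --nproc_per_node=1 train.py --batch_size=2                      # real enwik8 data (default)
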
@@ -171,10 +177,23 @@ disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()


-with gzip.open("./data/enwik8.gz") as file:
-    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
-    trX, vaX = np.split(X, [int(90e6)])
-    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
-
+def generate_dataset(dummy_data: bool = False):
+    if not dummy_data:
+        with gzip.open("./data/enwik8.gz") as file:
+            X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
+            trX, vaX = np.split(X, [int(90e6)])
+            data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+            # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}")
+            # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}")
+            return data_train, data_val
+    else:
+        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
+
+
+data_train, data_val = generate_dataset(args.dummy_data)
+
+print("generate dataset ready!")
+
+
 class TextSamplerDataset(Dataset):

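The dummy branch mirrors the real data's shape: the enwik8 path reads 95e6 bytes and splits them into 90,000,000 training and 5,000,000 validation bytes used as token ids, and the random fallback returns tensors of those same lengths with ids in [0, 100), so CI can exercise the full pipeline without ./data/enwik8.gz. (np.fromstring is deprecated in NumPy in favor of np.frombuffer, by the way.) A hedged wrapper sketch that picks between the two modes, with illustrative logic only:

    if [ -f ./data/enwik8.gz ]; then
      torchrun --standalone --nproc_per_node=1 train.py --batch_size=2                     # real enwik8 corpus
    else
      torchrun --standalone --nproc_per_node=1 train.py --dummy_data=True --batch_size=2   # random tokens, no dataset needed
    fi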