mirror of https://github.com/hpcaitech/ColossalAI

from model import GPT3_pipeline_hybrid

from colossalai.nn.optimizer import HybridAdam
from colossalai.zero.shard_utils import TensorShardStrategy

BATCH_SIZE = 192
NUM_EPOCHS = 60
SEQ_LEN = 2048
NUM_MICRO_BATCHES = 192
HIDDEN_SIZE = 12288
TENSOR_SHAPE = (BATCH_SIZE // NUM_MICRO_BATCHES, SEQ_LEN, HIDDEN_SIZE)
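# BATCH_SIZE // NUM_MICRO_BATCHES = 192 // 192 = 1, so each micro-batch holds a single
# sequence; TENSOR_SHAPE is then the (micro-batch size, SEQ_LEN, HIDDEN_SIZE) shape of
# the activation tensor passed between pipeline stages, i.e. (1, 2048, 12288).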

# if you do not want zero, just comment out this dictionary
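# Legacy ZeRO fields, roughly: TensorShardStrategy shards each tensor across the
# data-parallel group; tensor_placement_policy="cuda" keeps the shards in GPU memory
# rather than offloading them to CPU; initial_scale=2**16 is the initial dynamic loss
# scale used by the sharded optimizer.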
zero = dict(
    model_config=dict(tensor_placement_policy="cuda", shard_strategy=TensorShardStrategy()),
    optimizer_config=dict(initial_scale=2**16),
)
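
# HybridAdam is ColossalAI's Adam implementation that can update parameters resident on
# either GPU or CPU, which makes it a natural fit alongside the ZeRO config above.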
optimizer = dict(
    type=HybridAdam,
    lr=0.00015,
    weight_decay=1e-2,
)
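
# checkpoint=True enables activation checkpointing in the GPT model builder;
# num_chunks=1 keeps a single model chunk per pipeline stage (no interleaved schedule).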
model = dict(type=GPT3_pipeline_hybrid, checkpoint=True, num_chunks=1)

# pipeline parallel: modify integer value for the number of pipeline stages
# tensor parallel: modify size to set the tensor parallel size, usually the number of GPUs per node
# for the current model implementation, mode can only be 1D or None
parallel = dict(
    pipeline=1,
    tensor=dict(size=2, mode="1d"),
)
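
The dictionaries above are not executed by this file on its own; in the legacy config-driven ColossalAI workflow they are read by the training script once the config is parsed. Below is a minimal sketch of such an entry point, assuming the legacy colossalai.launch_from_torch / global_context API; the script name train_sketch.py and the config path ./gpt3_pp1d.py are placeholders, not part of this repository.

# train_sketch.py -- minimal sketch, assuming the legacy config-driven ColossalAI API;
# the config path below is a placeholder for the file shown above.
import colossalai
from colossalai.core import global_context as gpc


def main():
    # Parse the config file and set up the process groups for the pipeline and
    # 1D tensor parallelism declared in the `parallel` dictionary.
    colossalai.launch_from_torch(config="./gpt3_pp1d.py")

    # Every top-level name in the config becomes an attribute of gpc.config.
    micro_batch_size = gpc.config.BATCH_SIZE // gpc.config.NUM_MICRO_BATCHES
    print(f"micro-batch size per pipeline step: {micro_batch_size}")
    print(f"tensor shape between stages: {gpc.config.TENSOR_SHAPE}")


if __name__ == "__main__":
    main()

With tensor=dict(size=2) and pipeline=1, a launch such as torchrun --nproc_per_node=2 train_sketch.py would dedicate both processes to 1D tensor parallelism; the model, optimizer, and ZeRO wrappers are then built from the model, optimizer, and zero dictionaries by whatever training script consumes this config.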