[example] update gpt example for larger model scale (#2211)

pull/2212/head^2
Jiarui Fang 2022-12-28 13:54:08 +08:00 committed by GitHub
parent 24246f7aa5
commit d5e3e3ec01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 122 additions and 57 deletions

View File

@ -59,7 +59,6 @@ class MemStatsCollector:
return [t - self._sampling_time[0] for t in self._sampling_time] return [t - self._sampling_time[0] for t in self._sampling_time]
def start_collection(self): def start_collection(self):
print('start collection')
self._start_flag = True self._start_flag = True
self._mem_monitor.start() self._mem_monitor.start()
@ -68,7 +67,6 @@ class MemStatsCollector:
# self._step_total = len(self._sampling_time) # self._step_total = len(self._sampling_time)
self._step_total = len(self._memstats.non_model_data_list('cuda')) self._step_total = len(self._memstats.non_model_data_list('cuda'))
self._start_flag = False self._start_flag = False
self._mem_monitor.finish()
print(f'finish_collection {self._step_total}') print(f'finish_collection {self._step_total}')
# deprecated # deprecated

View File

@ -62,7 +62,7 @@ ColossalAI version 0.1.13.
How does Batch Size affect the efficiency? How does Batch Size affect the efficiency?
| model | #GPU | policy | TP |batch | Tflops | | model | #GPU | policy | TP | batch per DP | Tflops |
| ---------- | --------- |--------- |--------- |--------- |--------- | | ---------- | --------- |--------- |--------- |--------- |--------- |
| gpt2_10b | 2 | cpu | 1 | 32 | 122.046 | | gpt2_10b | 2 | cpu | 1 | 32 | 122.046 |
| gpt2_10b | 2 | cpu | 1 | 16 | 82.649 | | gpt2_10b | 2 | cpu | 1 | 16 | 82.649 |
@ -71,7 +71,7 @@ How dose Batch Size affect the efficency.
How does the Placement Policy affect the efficiency? How does the Placement Policy affect the efficiency?
| model | #GPU | policy | TP |batch | Tflops | | model | #GPU | policy | TP | batch per DP | Tflops |
| ---------- | --------- |--------- |--------- |--------- |--------- | | ---------- | --------- |--------- |--------- |--------- |--------- |
| gpt2_10b | 4 | auto | 1 | 8 | 88.657 | | gpt2_10b | 4 | auto | 1 | 8 | 88.657 |
| gpt2_10b | 4 | cuda | 1 | 8 | OOM | | gpt2_10b | 4 | cuda | 1 | 8 | OOM |
@ -80,9 +80,23 @@ How dose the Placement Policy affect the efficency.
How does the Tensor Parallel Degree affect the efficiency? How does the Tensor Parallel Degree affect the efficiency?
| model | #GPU | policy | TP |batch | Tflops | | model | #GPU | policy | TP | batch per DP | Tflops |
| ---------- | --------- |--------- |--------- |--------- |--------- | | ---------- | --------- |--------- |--------- |--------- |--------- |
| gpt2_10b | 4 | auto | 1 | 8 | 88.657 | | gpt2_10b | 4 | auto | 1 | 8 | 88.657 |
| gpt2_10b | 4 | auto | 2 | 8 | 56.687 | | gpt2_10b | 4 | auto | 2 | 8 | 56.687 |
| gpt2_10b | 4 | auto | 4 | 8 | 29.019 | | gpt2_10b | 4 | auto | 4 | 8 | 29.019 |
| gpt2_10b | 4 | auto | 4 | 64 | 50.411 | | gpt2_10b | 4 | auto | 4 | 64 | 50.411 |
| gpt2_20b | 1 | cpu | 1 | 8 | 43.102 |
| gpt2_20b | 4 | cpu | 4 | 8 | 28.491 |
Pushing the limits of model scale and batch size.
| model | #GPU | policy | TP | batch per DP | Tflops |
| ---------- | --------- |--------- |--------- |--------- |--------- |
| gpt2_20b | 4 | cpu | 1 | 64 | CUDA OOM |
| gpt2_20b | 4 | auto | 1/2 | 64 | CUDA OOM |
| gpt2_20b | 4 | cpu | 2 | 64 | 121.394 |
| gpt2_20b | 4 | cpu | 2 | 8 | 43.102 |
| gpt2_20b | 8 | cpu | 2 | 64 | 125.170 |

View File

@ -0,0 +1,71 @@
from torch import nn
from transformers import GPT2Config, GPT2LMHeadModel
## Define the Model and Loss Based on Huggingface transformers GPT2LMHeadModel
class GPTLMModel(nn.Module):
    """Wrap Hugging Face ``GPT2LMHeadModel`` and expose only its LM logits.

    Args:
        hidden_size: Forwarded to ``GPT2Config`` as ``n_embd``.
        num_layers: Forwarded to ``GPT2Config`` as ``n_layer``.
        num_attention_heads: Forwarded to ``GPT2Config`` as ``n_head``.
        max_seq_len: Forwarded to ``GPT2Config`` as both ``n_positions``
            and ``n_ctx``.
        vocab_size: Forwarded to ``GPT2Config`` as ``vocab_size``.
        checkpoint: When truthy, gradient checkpointing is enabled on the
            wrapped model and ``forward`` passes ``use_cache=False``.
    """

    def __init__(self,
                 hidden_size=768,
                 num_layers=12,
                 num_attention_heads=12,
                 max_seq_len=1024,
                 vocab_size=50257,
                 checkpoint=False):
        super().__init__()
        self.checkpoint = checkpoint

        # Describe the architecture first, then instantiate the model from it.
        gpt2_config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_attention_heads,
            n_positions=max_seq_len,
            n_ctx=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = GPT2LMHeadModel(gpt2_config)

        if checkpoint:
            self.model.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output."""
        # The cache flag is the inverse of the checkpointing flag.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=not self.checkpoint,
        )
        # Only return lm_logits
        return outputs[0]
def gpt2_medium(checkpoint=False):
    """Build a ``GPTLMModel`` with hidden size 1024, 24 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``False``.
    """
    preset = dict(hidden_size=1024, num_layers=24, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_xl(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 1600, 48 layers and 32 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=1600, num_layers=48, num_attention_heads=32)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_10b(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 4096, 50 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=4096, num_layers=50, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_14b(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 4096, 70 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=4096, num_layers=70, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_20b(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 8192, 25 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=8192, num_layers=25, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_24b(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 8192, 30 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=8192, num_layers=30, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def model_builder(model_size: str):
    """Return the builder function for a named GPT-2 preset.

    The returned object is the preset function itself (not a model
    instance); call it, optionally with ``checkpoint=...``, to construct
    the model.

    Args:
        model_size: One of ``"gpt2_medium"``, ``"gpt2_xl"``, ``"gpt2_10b"``,
            ``"gpt2_14b"``, ``"gpt2_20b"`` or ``"gpt2_24b"``.

    Returns:
        The preset builder function matching ``model_size``.

    Raises:
        ValueError: If ``model_size`` is not one of the names above.
    """
    if model_size == "gpt2_medium":
        return gpt2_medium
    if model_size == "gpt2_xl":
        return gpt2_xl
    if model_size == "gpt2_10b":
        return gpt2_10b
    if model_size == "gpt2_14b":
        return gpt2_14b
    if model_size == "gpt2_20b":
        return gpt2_20b
    if model_size == "gpt2_24b":
        return gpt2_24b

    # Fail here with the offending name instead of implicitly returning None,
    # which would only surface later as "'NoneType' object is not callable".
    raise ValueError(f"unknown model size {model_size!r}; expected one of: "
                     "gpt2_medium, gpt2_xl, gpt2_10b, gpt2_14b, gpt2_20b, gpt2_24b")
__all__ = ['model_builder']

View File

@ -2,9 +2,12 @@
export DISTPAN="colossalai" export DISTPAN="colossalai"
# The following options only valid when DISTPAN="colossalai" # The following options only valid when DISTPAN="colossalai"
export TPDEGREE=4 export TPDEGREE=2
export GPUNUM=4 export GPUNUM=8
export PLACEMENT='auto' export PLACEMENT='cpu'
export USE_SHARD_INIT=False export USE_SHARD_INIT=False
export BATCH_SIZE=64
export MODEL_TYPE="gpt2_20b"
env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log mkdir -p logs
env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log

View File

@ -6,18 +6,16 @@ import torch
import torch.nn as nn import torch.nn as nn
from packaging import version from packaging import version
from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import GPT2Config, GPT2LMHeadModel
import colossalai import colossalai
from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.optimizer import HybridAdam
from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
from colossalai.nn.parallel import ZeroDDP from colossalai.nn.parallel import ZeroDDP
from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
from colossalai.utils import get_current_device from colossalai.utils import get_current_device
from colossalai.utils.model.colo_init_context import ColoInitContext from colossalai.utils.model.colo_init_context import ColoInitContext
from colossalai.zero.sharded_optim import LowLevelZeroOptimizer from colossalai.zero.sharded_optim import LowLevelZeroOptimizer
from model_zoo import model_builder
def parse_args(): def parse_args():
@ -47,6 +45,18 @@ def parse_args():
help= help=
"Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.", "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
) )
parser.add_argument(
"--batch_size",
type=int,
default=8,
help="batch size per DP group of training.",
)
parser.add_argument(
"--model_type",
type=str,
default='gpt2_medium',
help="model model scale",
)
args = parser.parse_args() args = parser.parse_args()
return args return args
@ -65,33 +75,6 @@ def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
split_param_single_dim_tp1d(-1, param, pg) split_param_single_dim_tp1d(-1, param, pg)
## Define the Model and Loss Based on Huggingface transformers GPT2LMHeadModel
class GPTLMModel(nn.Module):
    """Wrap Hugging Face ``GPT2LMHeadModel`` and expose only its LM logits.

    Args:
        hidden_size: Forwarded to ``GPT2Config`` as ``n_embd``.
        num_layers: Forwarded to ``GPT2Config`` as ``n_layer``.
        num_attention_heads: Forwarded to ``GPT2Config`` as ``n_head``.
        max_seq_len: Forwarded to ``GPT2Config`` as both ``n_positions``
            and ``n_ctx``.
        vocab_size: Forwarded to ``GPT2Config`` as ``vocab_size``.
        checkpoint: When truthy, gradient checkpointing is enabled on the
            wrapped model and ``forward`` passes ``use_cache=False``.
    """

    def __init__(self,
                 hidden_size=768,
                 num_layers=12,
                 num_attention_heads=12,
                 max_seq_len=1024,
                 vocab_size=50257,
                 checkpoint=False):
        super().__init__()
        self.checkpoint = checkpoint

        # Describe the architecture first, then instantiate the model from it.
        gpt2_config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_attention_heads,
            n_positions=max_seq_len,
            n_ctx=max_seq_len,
            vocab_size=vocab_size,
        )
        self.model = GPT2LMHeadModel(gpt2_config)

        if checkpoint:
            self.model.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output."""
        # The cache flag is the inverse of the checkpointing flag.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=not self.checkpoint,
        )
        # Only return lm_logits
        return outputs[0]
class GPTLMLoss(nn.Module): class GPTLMLoss(nn.Module):
def __init__(self): def __init__(self):
@ -112,18 +95,6 @@ def get_data(batch_size, seq_len, vocab_size):
return input_ids, attention_mask return input_ids, attention_mask
def gpt2_medium(checkpoint=False):
    """Build a ``GPTLMModel`` with hidden size 1024, 24 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``False``.
    """
    preset = dict(hidden_size=1024, num_layers=24, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_xl(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 1600, 48 layers and 32 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=1600, num_layers=48, num_attention_heads=32)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def gpt2_10b(checkpoint=True):
    """Build a ``GPTLMModel`` with hidden size 4096, 50 layers and 16 heads.

    Args:
        checkpoint: Forwarded to ``GPTLMModel``; defaults to ``True``.
    """
    preset = dict(hidden_size=4096, num_layers=50, num_attention_heads=16)
    return GPTLMModel(checkpoint=checkpoint, **preset)
def get_cpu_mem():
    """Return the current process's ``rss`` from psutil divided by 1024**2.

    NOTE(review): this is MiB provided psutil reports ``rss`` in bytes —
    confirm against the psutil documentation.
    """
    resident = psutil.Process().memory_info().rss
    return resident / 1024**2
@ -210,7 +181,8 @@ def main():
if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]: if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
raise TypeError(f"{args.distplan} is error") raise TypeError(f"{args.distplan} is error")
BATCH_SIZE = 64 # batch size per DP degree
BATCH_SIZE = args.batch_size
SEQ_LEN = 1024 SEQ_LEN = 1024
VOCAB_SIZE = 50257 VOCAB_SIZE = 50257
@ -220,7 +192,7 @@ def main():
colossalai.launch_from_torch(config={}) colossalai.launch_from_torch(config={})
logger = get_dist_logger() logger = get_dist_logger()
logger.info(f"using dist plan {args.distplan}", ranks=[0]) logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
# build criterion # build criterion
criterion = GPTLMLoss() criterion = GPTLMLoss()
@ -232,8 +204,11 @@ def main():
default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
# build GPT model # build GPT model
with ColoInitContext(device=get_current_device(), default_dist_spec=default_dist_spec, default_pg=default_pg): with ColoInitContext(device=get_current_device(),
model = gpt2_10b(checkpoint=True) dtype=torch.half,
default_dist_spec=default_dist_spec,
default_pg=default_pg):
model = model_builder(args.model_type)(checkpoint=True)
pg = default_pg pg = default_pg
# Tensor Parallelism (TP) # Tensor Parallelism (TP)
@ -246,7 +221,7 @@ def main():
optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5) optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
logger.info(get_mem_info(prefix='After init optim, '), ranks=[0]) logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
else: else:
model = gpt2_10b(checkpoint=True).cuda() model = model_builder(args.model_type)(checkpoint=True).cuda()
if args.distplan.startswith("torch"): if args.distplan.startswith("torch"):
model = DDP(model) model = DDP(model)
@ -262,10 +237,14 @@ def main():
overlap_communication=True, overlap_communication=True,
partition_grad=partition_flag, partition_grad=partition_flag,
verbose=True) verbose=True)
# notice that the model is still in fp32
# model is sharded after TP
numel = sum([p.numel() for p in model.parameters()]) numel = sum([p.numel() for p in model.parameters()])
logger.info(get_mem_info(prefix='After init model, '), ranks=[0]) logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
# Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
# = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
# = batch_per_DP_group * numel * seq_len * 8
get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
torch.cuda.synchronize() torch.cuda.synchronize()