mirror of https://github.com/hpcaitech/ColossalAI
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
478 lines
18 KiB
478 lines
18 KiB
import math
|
|
from typing import Callable
|
|
|
|
import torch
|
|
from colossalai import nn as col_nn
|
|
from colossalai.builder.pipeline import partition_uniform
|
|
from colossalai.context import ParallelMode
|
|
from colossalai.core import global_context as gpc
|
|
from colossalai.logging import get_dist_logger
|
|
from colossalai.nn.layer.utils import CheckpointModule, divide
|
|
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
|
|
from colossalai.registry import LAYERS, LOSSES, MODELS
|
|
from colossalai.utils import get_current_device
|
|
from torch import dtype, nn
|
|
|
|
__all__ = [
|
|
'GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt2_xl_pipeline',
|
|
'gpt2_8B_pipeline', 'gpt3', 'gpt3_pipeline'
|
|
]
|
|
|
|
|
|
@LAYERS.register_module
|
|
class GPTEmbedding(nn.Module):
|
|
|
|
def __init__(self,
|
|
embedding_dim: int,
|
|
vocab_size: int,
|
|
max_position_embeddings: int,
|
|
num_tokentypes: int = 0,
|
|
padding_idx: int = None,
|
|
dropout: float = 0.,
|
|
dtype: dtype = None) -> None:
|
|
super().__init__()
|
|
self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype)
|
|
self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype)
|
|
if num_tokentypes > 0:
|
|
self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype)
|
|
else:
|
|
self.tokentype_embeddings = None
|
|
self.dropout = col_nn.Dropout(dropout)
|
|
|
|
@property
|
|
def word_embedding_weight(self):
|
|
return self.word_embeddings.weight
|
|
|
|
def forward(self, input_ids, position_ids=None, tokentype_ids=None):
|
|
seq_length = input_ids.size(1)
|
|
if position_ids is None:
|
|
position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0)
|
|
x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids)
|
|
if self.tokentype_embeddings is not None and tokentype_ids is not None:
|
|
x = x + self.tokentype_embeddings(tokentype_ids)
|
|
x = self.dropout(x)
|
|
|
|
return x
|
|
|
|
|
|
@LAYERS.register_module
|
|
class GPTSelfAttention(nn.Module):
|
|
|
|
def __init__(self,
|
|
dim: int,
|
|
num_heads: int,
|
|
attention_dropout: float,
|
|
dropout: float,
|
|
bias: bool = True,
|
|
fuse_scale_mask_softmax: bool = False,
|
|
dtype: dtype = None) -> None:
|
|
super().__init__()
|
|
self.fuse_scale_mask_softmax = fuse_scale_mask_softmax
|
|
self.attention_head_size = divide(dim, num_heads)
|
|
self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias)
|
|
if fuse_scale_mask_softmax:
|
|
from colossalai.kernel import FusedScaleMaskSoftmax
|
|
from colossalai.kernel.cuda_native.scaled_softmax import \
|
|
AttnMaskType
|
|
self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True,
|
|
input_in_bf16=False,
|
|
attn_mask_type=AttnMaskType.causal,
|
|
scaled_masked_softmax_fusion=True,
|
|
mask_func=None,
|
|
softmax_in_fp32=True,
|
|
scale=math.sqrt(self.attention_head_size))
|
|
else:
|
|
self.softmax = nn.Softmax(dim=-1)
|
|
self.attention_dropout = col_nn.Dropout(attention_dropout)
|
|
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True)
|
|
self.dropout = col_nn.Dropout(dropout)
|
|
|
|
def forward(self, x, attention_mask=None):
|
|
qkv = self.query_key_value(x)
|
|
all_head_size = qkv.shape[-1] // 3
|
|
num_attention_heads = divide(all_head_size, self.attention_head_size)
|
|
new_qkv_shape = qkv.shape[:-1] + \
|
|
(num_attention_heads, 3 * self.attention_head_size)
|
|
qkv = qkv.view(new_qkv_shape)
|
|
qkv = qkv.permute((0, 2, 1, 3))
|
|
q, k, v = torch.chunk(qkv, 3, dim=-1)
|
|
|
|
x = torch.matmul(q, k.transpose(-1, -2))
|
|
|
|
if self.fuse_scale_mask_softmax:
|
|
x = self.softmax(x, attention_mask)
|
|
else:
|
|
x = x / math.sqrt(self.attention_head_size)
|
|
# causal mask
|
|
q_len, k_len = q.size(-2), k.size(-2)
|
|
causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8,
|
|
device=get_current_device())).view(1, 1, q_len, k_len).bool()
|
|
x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device()))
|
|
if attention_mask is not None:
|
|
x = x + attention_mask
|
|
x = self.softmax(x)
|
|
|
|
x = self.attention_dropout(x)
|
|
|
|
x = torch.matmul(x, v)
|
|
x = x.transpose(1, 2)
|
|
new_context_layer_shape = x.size()[:-2] + (all_head_size,)
|
|
x = x.reshape(new_context_layer_shape)
|
|
|
|
x = self.dense(x)
|
|
x = self.dropout(x)
|
|
|
|
return x
|
|
|
|
|
|
@LAYERS.register_module
|
|
class GPTMLP(nn.Module):
|
|
|
|
def __init__(self,
|
|
dim: int,
|
|
mlp_ratio: float,
|
|
activation: Callable,
|
|
dropout: float,
|
|
dtype: dtype = None,
|
|
bias: bool = True):
|
|
super().__init__()
|
|
intermediate_dim = int(dim * mlp_ratio)
|
|
self.dense_1 = col_nn.Linear(dim, intermediate_dim, dtype=dtype, bias=bias)
|
|
self.activation = activation
|
|
self.dense_2 = col_nn.Linear(intermediate_dim, dim, dtype=dtype, bias=bias)
|
|
self.dropout = col_nn.Dropout(dropout)
|
|
|
|
def forward(self, x):
|
|
x = self.dense_1(x)
|
|
x = self.activation(x)
|
|
x = self.dense_2(x)
|
|
x = self.dropout(x)
|
|
return x
|
|
|
|
|
|
@LAYERS.register_module
|
|
class GPTBlock(CheckpointModule):
|
|
|
|
def __init__(self,
|
|
dim: int,
|
|
num_heads: int,
|
|
mlp_ratio: float,
|
|
activation: Callable,
|
|
attention_dropout: float = 0.,
|
|
dropout: float = 0.,
|
|
layernorm_epsilon: float = 1e-5,
|
|
dtype: dtype = None,
|
|
bias: bool = True,
|
|
apply_post_layernorm: bool = False,
|
|
fuse_scale_mask_softmax: bool = False,
|
|
checkpoint: bool = False,
|
|
activation_offload: bool = False):
|
|
super().__init__(checkpoint, activation_offload)
|
|
self.apply_post_layernorm = apply_post_layernorm
|
|
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
|
self.attn = GPTSelfAttention(dim=dim,
|
|
num_heads=num_heads,
|
|
attention_dropout=attention_dropout,
|
|
dropout=dropout,
|
|
bias=bias,
|
|
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
|
dtype=dtype)
|
|
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
|
self.mlp = GPTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, dropout=dropout, dtype=dtype, bias=bias)
|
|
|
|
def _forward(self, x, attention_mask=None):
|
|
if not self.apply_post_layernorm:
|
|
residual = x
|
|
x = self.norm1(x)
|
|
if self.apply_post_layernorm:
|
|
residual = x
|
|
x = residual + self.attn(x, attention_mask)
|
|
|
|
if not self.apply_post_layernorm:
|
|
residual = x
|
|
x = self.norm2(x)
|
|
if self.apply_post_layernorm:
|
|
residual = x
|
|
x = residual + self.mlp(x)
|
|
|
|
return x, attention_mask
|
|
|
|
|
|
@LAYERS.register_module
|
|
class GPTLMHead(nn.Module):
|
|
|
|
def __init__(self,
|
|
dim: int,
|
|
vocab_size: int,
|
|
word_embeeding_weight: nn.Parameter = None,
|
|
bias: bool = False,
|
|
dtype: dtype = None) -> None:
|
|
super().__init__()
|
|
self.dense = col_nn.Classifier(dim, vocab_size, word_embeeding_weight, bias=bias, dtype=dtype)
|
|
|
|
@property
|
|
def weight(self):
|
|
return self.dense.weight
|
|
|
|
def forward(self, x):
|
|
x = self.dense(x)
|
|
return x
|
|
|
|
|
|
@LOSSES.register_module
|
|
class GPTLMLoss(nn.Module):
|
|
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.loss = col_nn.CrossEntropyLoss()
|
|
|
|
def forward(self, logits, labels):
|
|
shift_logits = logits[..., :-1, :].contiguous()
|
|
shift_labels = labels[..., 1:].contiguous()
|
|
# Flatten the tokens
|
|
return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
|
|
|
|
|
@MODELS.register_module
|
|
class GPT(nn.Module):
|
|
|
|
def __init__(self,
|
|
vocab_size: int = 50304,
|
|
max_position_embeddings: int = 1024,
|
|
dim: int = 768,
|
|
num_heads: int = 12,
|
|
depth: int = 12,
|
|
mlp_ratio: float = 4.0,
|
|
dropout: float = 0.1,
|
|
embedding_dropout: float = 0.1,
|
|
attention_dropout: float = 0.1,
|
|
layernorm_epsilon: float = 1e-5,
|
|
activation: Callable = nn.functional.gelu,
|
|
padding_idx: int = None,
|
|
dtype: dtype = None,
|
|
bias: bool = True,
|
|
apply_post_layernorm: bool = False,
|
|
fuse_scale_mask_softmax: bool = False,
|
|
checkpoint: bool = False,
|
|
activation_offload: bool = False) -> None:
|
|
super().__init__()
|
|
self.embed = GPTEmbedding(embedding_dim=dim,
|
|
vocab_size=vocab_size,
|
|
max_position_embeddings=max_position_embeddings,
|
|
padding_idx=padding_idx,
|
|
dropout=embedding_dropout,
|
|
dtype=dtype)
|
|
self.blocks = nn.ModuleList([
|
|
GPTBlock(
|
|
dim=dim,
|
|
num_heads=num_heads,
|
|
mlp_ratio=mlp_ratio,
|
|
activation=activation,
|
|
attention_dropout=attention_dropout,
|
|
dropout=dropout,
|
|
layernorm_epsilon=layernorm_epsilon,
|
|
dtype=dtype,
|
|
bias=bias,
|
|
apply_post_layernorm=apply_post_layernorm,
|
|
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
|
checkpoint=checkpoint,
|
|
activation_offload=activation_offload
|
|
) for _ in range(depth)
|
|
])
|
|
|
|
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
|
|
|
self.head = GPTLMHead(dim=dim,
|
|
vocab_size=vocab_size,
|
|
word_embeeding_weight=self.embed.word_embedding_weight,
|
|
dtype=dtype)
|
|
|
|
def forward(self, input_ids, attention_mask=None):
|
|
x = self.embed(input_ids)
|
|
|
|
# We create a 3D attention mask from a 2D tensor mask.
|
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
|
# Adapted from huggingface
|
|
if attention_mask is not None:
|
|
batch_size = input_ids.shape[0]
|
|
attention_mask = attention_mask.view(batch_size, -1)
|
|
attention_mask = col_nn.partition_batch(attention_mask)
|
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
|
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
|
|
|
for block in self.blocks:
|
|
x, attention_mask = block(x, attention_mask)
|
|
|
|
x = self.head(self.norm(x))
|
|
|
|
return x
|
|
|
|
|
|
class PipelineGPT(nn.Module):
|
|
|
|
def __init__(self,
|
|
vocab_size: int = 50304,
|
|
max_position_embeddings: int = 1024,
|
|
dim: int = 768,
|
|
num_heads: int = 12,
|
|
depth: int = 12,
|
|
mlp_ratio: float = 4.0,
|
|
dropout: float = 0.1,
|
|
embedding_dropout: float = 0.1,
|
|
attention_dropout: float = 0.1,
|
|
layernorm_epsilon: float = 1e-5,
|
|
activation: Callable = nn.functional.gelu,
|
|
padding_idx: int = None,
|
|
dtype: dtype = None,
|
|
bias: bool = True,
|
|
apply_post_layernorm: bool = False,
|
|
fuse_scale_mask_softmax: bool = False,
|
|
checkpoint: bool = False,
|
|
first: bool = False,
|
|
last: bool = False):
|
|
super().__init__()
|
|
self.checkpoint = checkpoint
|
|
self.first = first
|
|
self.last = last
|
|
if first:
|
|
self.embed = GPTEmbedding(embedding_dim=dim,
|
|
vocab_size=vocab_size,
|
|
max_position_embeddings=max_position_embeddings,
|
|
padding_idx=padding_idx,
|
|
dropout=embedding_dropout,
|
|
dtype=dtype)
|
|
self.blocks = nn.ModuleList([
|
|
GPTBlock(
|
|
dim=dim,
|
|
num_heads=num_heads,
|
|
mlp_ratio=mlp_ratio,
|
|
activation=activation,
|
|
attention_dropout=attention_dropout,
|
|
dropout=dropout,
|
|
layernorm_epsilon=layernorm_epsilon,
|
|
dtype=dtype,
|
|
bias=bias,
|
|
apply_post_layernorm=apply_post_layernorm,
|
|
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
|
checkpoint=checkpoint,
|
|
) for _ in range(depth)
|
|
])
|
|
if self.last:
|
|
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
|
self.head = GPTLMHead(dim=dim, vocab_size=vocab_size, dtype=dtype)
|
|
|
|
def forward(self, x=None, input_ids=None, attention_mask=None):
|
|
if self.first:
|
|
x = self.embed(input_ids)
|
|
|
|
# We create a 3D attention mask from a 2D tensor mask.
|
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
|
# Adapted from huggingface
|
|
if attention_mask is not None:
|
|
if self.first:
|
|
batch_size = input_ids.shape[0]
|
|
else:
|
|
batch_size = x.shape[0]
|
|
attention_mask = attention_mask.view(batch_size, -1)
|
|
attention_mask = col_nn.partition_batch(attention_mask)
|
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
|
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
|
|
|
for block in self.blocks:
|
|
x, attention_mask = block(x, attention_mask)
|
|
|
|
if self.last:
|
|
x = self.head(self.norm(x))
|
|
|
|
return x
|
|
|
|
|
|
def _create_gpt_model(**model_kwargs):
|
|
model = GPT(**model_kwargs)
|
|
return model
|
|
|
|
|
|
def _create_gpt_pipeline_model(depth=48, num_chunks=1, layer_partitions=None, **model_kwargs):
|
|
logger = get_dist_logger()
|
|
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
|
|
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
|
rank = gpc.get_global_rank()
|
|
wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
|
|
parts = partition_uniform(depth, pipeline_size,
|
|
num_chunks)[pipeline_rank] if layer_partitions is None else layer_partitions
|
|
models = []
|
|
for start, end in parts:
|
|
model_kwargs['first'] = start == 0
|
|
model_kwargs['last'] = end == depth
|
|
model_kwargs['depth'] = end - start
|
|
chunk = PipelineGPT(**model_kwargs).to(get_current_device())
|
|
if start == 0:
|
|
wrapper.register_parameter(chunk.embed.word_embedding_weight)
|
|
elif end == depth:
|
|
wrapper.register_parameter(chunk.head.weight)
|
|
models.append(chunk)
|
|
logger.info(f'==> Rank {rank} built layer {start}-{end} / total {depth}')
|
|
if len(models) == 1:
|
|
model = models[0]
|
|
else:
|
|
model = nn.ModuleList(models)
|
|
return model
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_small(**kwargs):
|
|
model_kwargs = dict(dim=768, depth=12, num_heads=12, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_medium(**kwargs):
|
|
model_kwargs = dict(dim=1024, depth=24, num_heads=8, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_large(**kwargs):
|
|
model_kwargs = dict(dim=1536, depth=36, num_heads=12, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_xl(**kwargs):
|
|
model_kwargs = dict(dim=1600, depth=48, num_heads=16, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_8B(**kwargs):
|
|
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_xl_pipeline(**kwargs):
|
|
model_kwargs = dict(dim=1600, depth=48, num_heads=20, **kwargs)
|
|
return _create_gpt_pipeline_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt2_8B_pipeline(**kwargs):
|
|
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
|
return _create_gpt_pipeline_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt3(**kwargs):
|
|
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
|
return _create_gpt_model(**model_kwargs)
|
|
|
|
|
|
@MODELS.register_module
|
|
def gpt3_pipeline(**kwargs):
|
|
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
|
return _create_gpt_pipeline_model(**model_kwargs)
|