mirror of https://github.com/hpcaitech/ColossalAI
[titans]remove model zoo (#1042)
* [CLI] add CLI launcher
* Revert "[CLI] add CLI launcher"
This reverts commit df7e6506d4.
* rm model zoo
pull/1015/head^2
parent 0dac86866b
commit 9feff0f760
|
@ -1 +0,0 @@
|
|||
from .gpt import *
|
|
@ -1,478 +0,0 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.builder.pipeline import partition_uniform
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn.layer.utils import CheckpointModule, divide
|
||||
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
|
||||
from colossalai.registry import LAYERS, LOSSES, MODELS
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import dtype, nn
|
||||
|
||||
__all__ = [
|
||||
'GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt2_xl_pipeline',
|
||||
'gpt2_8B_pipeline', 'gpt3', 'gpt3_pipeline'
|
||||
]
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTEmbedding(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
embedding_dim: int,
|
||||
vocab_size: int,
|
||||
max_position_embeddings: int,
|
||||
num_tokentypes: int = 0,
|
||||
padding_idx: int = None,
|
||||
dropout: float = 0.,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype)
|
||||
self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype)
|
||||
if num_tokentypes > 0:
|
||||
self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype)
|
||||
else:
|
||||
self.tokentype_embeddings = None
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
@property
|
||||
def word_embedding_weight(self):
|
||||
return self.word_embeddings.weight
|
||||
|
||||
def forward(self, input_ids, position_ids=None, tokentype_ids=None):
|
||||
seq_length = input_ids.size(1)
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0)
|
||||
x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids)
|
||||
if self.tokentype_embeddings is not None and tokentype_ids is not None:
|
||||
x = x + self.tokentype_embeddings(tokentype_ids)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
attention_dropout: float,
|
||||
dropout: float,
|
||||
bias: bool = True,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.fuse_scale_mask_softmax = fuse_scale_mask_softmax
|
||||
self.attention_head_size = divide(dim, num_heads)
|
||||
self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias)
|
||||
if fuse_scale_mask_softmax:
|
||||
from colossalai.kernel import FusedScaleMaskSoftmax
|
||||
from colossalai.kernel.cuda_native.scaled_softmax import \
|
||||
AttnMaskType
|
||||
self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True,
|
||||
input_in_bf16=False,
|
||||
attn_mask_type=AttnMaskType.causal,
|
||||
scaled_masked_softmax_fusion=True,
|
||||
mask_func=None,
|
||||
softmax_in_fp32=True,
|
||||
scale=math.sqrt(self.attention_head_size))
|
||||
else:
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
self.attention_dropout = col_nn.Dropout(attention_dropout)
|
||||
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True)
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x, attention_mask=None):
|
||||
qkv = self.query_key_value(x)
|
||||
q, k, v = torch.chunk(qkv, 3, dim=-1)
|
||||
all_head_size = q.shape[-1]
|
||||
num_attention_heads = divide(all_head_size, self.attention_head_size)
|
||||
new_shape = q.shape[:-1] + \
|
||||
(num_attention_heads, self.attention_head_size)
|
||||
q = q.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
k = k.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
v = v.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
|
||||
x = torch.matmul(q, k.transpose(-1, -2))
|
||||
|
||||
if self.fuse_scale_mask_softmax:
|
||||
x = self.softmax(x, attention_mask)
|
||||
else:
|
||||
x = x / math.sqrt(self.attention_head_size)
|
||||
# causal mask
|
||||
q_len, k_len = q.size(-2), k.size(-2)
|
||||
causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8,
|
||||
device=get_current_device())).view(1, 1, q_len, k_len).bool()
|
||||
x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device()))
|
||||
if attention_mask is not None:
|
||||
x = x + attention_mask
|
||||
x = self.softmax(x)
|
||||
|
||||
x = self.attention_dropout(x)
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_context_layer_shape = x.size()[:-2] + (all_head_size,)
|
||||
x = x.reshape(new_context_layer_shape)
|
||||
|
||||
x = self.dense(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
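The unfused branch above applies the causal mask by writing a large negative value into future positions before the softmax. As a point of reference only (not part of the original file), the same masking can be reproduced with plain PyTorch and toy shapes:

import math
import torch

# Toy shapes: (batch, heads, seq_len, head_size)
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
head_size = q.size(-1)

scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)

# Lower-triangular causal mask, broadcast over batch and heads
q_len, k_len = scores.size(-2), scores.size(-1)
causal_mask = torch.tril(torch.ones(q_len, k_len, dtype=torch.bool)).view(1, 1, q_len, k_len)
scores = torch.where(causal_mask, scores, torch.tensor(-1e4, dtype=scores.dtype))

probs = torch.softmax(scores, dim=-1)   # each position attends only to itself and earlier tokens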
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTMLP(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True):
|
||||
super().__init__()
|
||||
intermediate_dim = int(dim * mlp_ratio)
|
||||
self.dense_1 = col_nn.Linear(dim, intermediate_dim, dtype=dtype, bias=bias)
|
||||
self.activation = activation
|
||||
self.dense_2 = col_nn.Linear(intermediate_dim, dim, dtype=dtype, bias=bias)
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense_1(x)
|
||||
x = self.activation(x)
|
||||
x = self.dense_2(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTBlock(CheckpointModule):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
activation_offload: bool = False):
|
||||
super().__init__(checkpoint, activation_offload)
|
||||
self.apply_post_layernorm = apply_post_layernorm
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = GPTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
dtype=dtype)
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.mlp = GPTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, dropout=dropout, dtype=dtype, bias=bias)
|
||||
|
||||
def _forward(self, x, attention_mask=None):
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm1(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.attn(x, attention_mask)
|
||||
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm2(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.mlp(x)
|
||||
|
||||
return x, attention_mask
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTLMHead(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
vocab_size: int,
|
||||
word_embedding_weight: nn.Parameter = None,
|
||||
bias: bool = False,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.dense = col_nn.Classifier(dim, vocab_size, word_embedding_weight, bias=bias, dtype=dtype)
|
||||
|
||||
@property
|
||||
def weight(self):
|
||||
return self.dense.weight
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense(x)
|
||||
return x
|
||||
|
||||
|
||||
@LOSSES.register_module
|
||||
class GPTLMLoss(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.loss = col_nn.CrossEntropyLoss()
|
||||
|
||||
def forward(self, logits, labels):
|
||||
shift_logits = logits[..., :-1, :].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
# Flatten the tokens
|
||||
return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
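GPTLMLoss implements standard next-token prediction: position t is scored against token t + 1. A minimal plain-PyTorch sketch of the same shift, using torch.nn.CrossEntropyLoss as a stand-in for the tensor-parallel col_nn.CrossEntropyLoss:

import torch
import torch.nn as nn

logits = torch.randn(2, 5, 10)          # (batch, seq_len, vocab_size)
labels = torch.randint(0, 10, (2, 5))   # (batch, seq_len)

# Drop the last logit and the first label so position t predicts token t + 1
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss = nn.CrossEntropyLoss()(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))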
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class GPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
activation_offload: bool = False) -> None:
|
||||
super().__init__()
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
self.blocks = nn.ModuleList([
|
||||
GPTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint,
|
||||
activation_offload=activation_offload
|
||||
) for _ in range(depth)
|
||||
])
|
||||
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
self.head = GPTLMHead(dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
word_embedding_weight=self.embed.word_embedding_weight,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None):
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
batch_size = input_ids.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class PipelineGPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
first: bool = False,
|
||||
last: bool = False):
|
||||
super().__init__()
|
||||
self.checkpoint = checkpoint
|
||||
self.first = first
|
||||
self.last = last
|
||||
if first:
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
self.blocks = nn.ModuleList([
|
||||
GPTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint,
|
||||
) for _ in range(depth)
|
||||
])
|
||||
if self.last:
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.head = GPTLMHead(dim=dim, vocab_size=vocab_size, dtype=dtype)
|
||||
|
||||
def forward(self, x=None, input_ids=None, attention_mask=None):
|
||||
if self.first:
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
if self.first:
|
||||
batch_size = input_ids.shape[0]
|
||||
else:
|
||||
batch_size = x.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
if self.last:
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _create_gpt_model(**model_kwargs):
|
||||
model = GPT(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def _create_gpt_pipeline_model(depth=48, num_chunks=1, layer_partitions=None, **model_kwargs):
|
||||
logger = get_dist_logger()
|
||||
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
|
||||
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
||||
rank = gpc.get_global_rank()
|
||||
wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
|
||||
parts = partition_uniform(depth, pipeline_size,
|
||||
num_chunks)[pipeline_rank] if layer_partitions is None else layer_partitions
|
||||
models = []
|
||||
for start, end in parts:
|
||||
model_kwargs['first'] = start == 0
|
||||
model_kwargs['last'] = end == depth
|
||||
model_kwargs['depth'] = end - start
|
||||
chunk = PipelineGPT(**model_kwargs).to(get_current_device())
|
||||
if start == 0:
|
||||
wrapper.register_parameter(chunk.embed.word_embedding_weight)
|
||||
elif end == depth:
|
||||
wrapper.register_parameter(chunk.head.weight)
|
||||
models.append(chunk)
|
||||
logger.info(f'==> Rank {rank} built layer {start}-{end} / total {depth}')
|
||||
if len(models) == 1:
|
||||
model = models[0]
|
||||
else:
|
||||
model = nn.ModuleList(models)
|
||||
return model
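With num_chunks=1, uniform partitioning slices the depth into contiguous, equally sized ranges, one per pipeline rank; the first stage owns the embedding, the last stage owns the LM head, and the two share the word embedding weight through PipelineSharedModuleWrapper. The following is an illustration of the resulting ranges only, not the actual partition_uniform implementation:

depth, pipeline_size = 48, 4
per_stage = depth // pipeline_size
parts = [(rank * per_stage, (rank + 1) * per_stage) for rank in range(pipeline_size)]
# [(0, 12), (12, 24), (24, 36), (36, 48)]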
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_small(**kwargs):
|
||||
model_kwargs = dict(dim=768, depth=12, num_heads=12, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_medium(**kwargs):
|
||||
model_kwargs = dict(dim=1024, depth=24, num_heads=8, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_large(**kwargs):
|
||||
model_kwargs = dict(dim=1536, depth=36, num_heads=12, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_xl(**kwargs):
|
||||
model_kwargs = dict(dim=1600, depth=48, num_heads=16, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_8B(**kwargs):
|
||||
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_xl_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=1600, depth=48, num_heads=20, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_8B_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt3(**kwargs):
|
||||
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt3_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
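As a rough illustration of how these configuration names map to parameter scale (ignoring embeddings, biases and LayerNorms, and assuming mlp_ratio=4, so each block holds about 12 * dim^2 parameters):

for name, dim, depth in [('gpt2_small', 768, 12), ('gpt2_xl', 1600, 48),
                         ('gpt2_8B', 3072, 72), ('gpt3', 12288, 96)]:
    print(name, f'{12 * depth * dim * dim / 1e9:.2f}B')
# gpt2_small 0.08B, gpt2_xl 1.47B, gpt2_8B 8.15B, gpt3 173.95B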
|
|
@ -1,26 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from colossalai.nn.layer import WrappedDropPath as DropPath
|
||||
|
||||
|
||||
class TransformerLayer(nn.Module):
|
||||
"""Transformer layer builder.
|
||||
"""
|
||||
def __init__(self,
|
||||
att: nn.Module,
|
||||
ffn: nn.Module,
|
||||
norm1: nn.Module,
|
||||
norm2: nn.Module,
|
||||
droppath=None,
|
||||
droppath_rate: float = 0):
|
||||
super().__init__()
|
||||
self.att = att
|
||||
self.ffn = ffn
|
||||
self.norm1 = norm1
|
||||
self.norm2 = norm2
|
||||
self.droppath = DropPath(droppath_rate) if droppath is None else droppath
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.droppath(self.att(self.norm1(x)))
|
||||
x = x + self.droppath(self.ffn(self.norm2(x)))
|
||||
return x
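TransformerLayer only wires pre-norm residual branches around whatever single-input attention and FFN modules it is given. A rough usage sketch with plain torch.nn stand-ins (assuming colossalai is installed so the module's imports resolve; the Linear layers below are placeholders, not real attention or FFN blocks):

import torch
import torch.nn as nn

d_model = 16
layer = TransformerLayer(att=nn.Linear(d_model, d_model),
                         ffn=nn.Linear(d_model, d_model),
                         norm1=nn.LayerNorm(d_model),
                         norm2=nn.LayerNorm(d_model),
                         droppath=nn.Identity())
out = layer(torch.randn(2, 4, d_model))   # output keeps the input shape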
|
|
@ -1,2 +0,0 @@
|
|||
from .models import Widenet, ViTMoE
|
||||
from .gpt import MOEGPT, prmoe_4b, prmoe_31b, prmoe_51b
|
|
@ -1,229 +0,0 @@
|
|||
from typing import Callable, List
|
||||
from torch import dtype, nn
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.registry import LAYERS, MODELS
|
||||
from colossalai.nn.layer import MoeModule
|
||||
from colossalai.context import MOE_CONTEXT
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn.layer.utils import CheckpointModule, divide
|
||||
from model_zoo.gpt.gpt import GPTEmbedding, GPTSelfAttention, GPTMLP, GPTBlock, GPTLMHead
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class MOEGPTBlock(CheckpointModule):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
capacity_factor_train: float = 1.0,
|
||||
capacity_factor_eval: float = 1.0,
|
||||
use_residual: bool = False,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False):
|
||||
super().__init__(checkpoint)
|
||||
self.apply_post_layernorm = apply_post_layernorm
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = GPTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
dtype=dtype)
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
mlp_factory_dict = dict(dim=dim,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
bias=bias)
|
||||
|
||||
self.mlp = MoeModule(dim_model=dim,
|
||||
num_experts=num_experts,
|
||||
top_k=1,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_policy='Jitter',
|
||||
use_residual=use_residual,
|
||||
expert_cls=GPTMLP,
|
||||
**mlp_factory_dict)
|
||||
|
||||
def _forward(self, x, attention_mask=None):
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm1(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.attn(x, attention_mask)
|
||||
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm2(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.mlp(x)
|
||||
|
||||
return x, attention_mask
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class MOEGPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int or List[int],
|
||||
use_residual: bool = False,
|
||||
capacity_factor_train: float = 1.0,
|
||||
capacity_factor_eval: float = 1.0,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False) -> None:
|
||||
super().__init__()
|
||||
|
||||
half_depth = divide(depth, 2)
|
||||
if isinstance(num_experts, list):
|
||||
assert len(num_experts) == half_depth, \
|
||||
"The length of num_experts should equal to the number of MOE layers"
|
||||
num_experts_list = num_experts
|
||||
else:
|
||||
num_experts_list = [num_experts] * half_depth
|
||||
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
|
||||
block_list = []
|
||||
block_factory_dict = dict(dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint)
|
||||
|
||||
for i in range(depth):
|
||||
|
||||
if i % 2 == 0:
|
||||
block_module = GPTBlock(**block_factory_dict)
|
||||
else:
|
||||
num_experts = num_experts_list[i // 2]
|
||||
block_module = MOEGPTBlock(num_experts=num_experts,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
use_residual=use_residual,
|
||||
**block_factory_dict)
|
||||
|
||||
block_list.append(block_module)
|
||||
|
||||
self.blocks = nn.ModuleList(block_list)
|
||||
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
self.head = GPTLMHead(dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
word_embedding_weight=self.embed.word_embedding_weight,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
batch_size = input_ids.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _create_moegpt_model(**model_kwargs):
|
||||
model = MOEGPT(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def _prmoe_check_sanity(kwargs_dict):
|
||||
logger = get_dist_logger()
|
||||
if not kwargs_dict.pop('use_residual', False):
|
||||
logger.warning(
|
||||
"If you want to use PR-MOE, please set 'use_residual' to True. "
|
||||
"Otherwise, we'll force 'use_residual' to True.",
|
||||
ranks=[0])
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_4b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64],
|
||||
use_residual=True,
|
||||
dim=1024,
|
||||
depth=24,
|
||||
num_heads=16,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_31b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128],
|
||||
use_residual=True,
|
||||
dim=2048,
|
||||
depth=24,
|
||||
num_heads=16,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_51b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64],
|
||||
use_residual=True,
|
||||
dim=3072,
|
||||
depth=32,
|
||||
num_heads=24,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
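MOEGPT alternates dense and MoE blocks: even-indexed layers are plain GPTBlocks, odd-indexed layers are MOEGPTBlocks, and num_experts_list is indexed by i // 2. A small illustration of the layer layout that prmoe_4b (depth 24) produces:

depth = 24
num_experts_list = [32] * 10 + [64] * 2    # one entry per MoE layer, as in prmoe_4b
layout = ['dense' if i % 2 == 0 else f'moe-{num_experts_list[i // 2]}' for i in range(depth)]
# ['dense', 'moe-32', 'dense', 'moe-32', ..., 'dense', 'moe-64']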
|
|
@ -1,226 +0,0 @@
|
|||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \
|
||||
WrappedDropout as Dropout, WrappedDropPath as DropPath
|
||||
from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator, MoeModule
|
||||
from .util import moe_sa_args, moe_mlp_args
|
||||
from ..helper import TransformerLayer
|
||||
from colossalai.context.moe_context import MOE_CONTEXT
|
||||
from colossalai.utils import get_current_device
|
||||
from typing import List
|
||||
|
||||
|
||||
class VanillaSelfAttention(nn.Module):
|
||||
"""Standard ViT self attention.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
d_model: int,
|
||||
n_heads: int,
|
||||
d_kv: int,
|
||||
attention_drop: float = 0,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True,
|
||||
dropout1=None,
|
||||
dropout2=None):
|
||||
super().__init__()
|
||||
self.n_heads = n_heads
|
||||
self.d_kv = d_kv
|
||||
self.scale = 1.0 / math.sqrt(self.d_kv)
|
||||
|
||||
self.dense1 = nn.Linear(d_model, 3 * n_heads * d_kv, bias, device=get_current_device())
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1
|
||||
self.dense2 = nn.Linear(n_heads * d_kv, d_model, device=get_current_device())
|
||||
self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2
|
||||
|
||||
def forward(self, x):
|
||||
qkv = self.dense1(x)
|
||||
new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv)
|
||||
qkv = qkv.view(*new_shape)
|
||||
qkv = qkv.permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv[:]
|
||||
|
||||
x = torch.matmul(q, k.transpose(-2, -1)) * self.scale
|
||||
x = self.atten_drop(self.softmax(x))
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_shape = x.shape[:2] + (self.n_heads * self.d_kv,)
|
||||
x = x.reshape(*new_shape)
|
||||
x = self.dense2(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class VanillaFFN(nn.Module):
|
||||
"""FFN composed with two linear layers, also called MLP.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
d_model: int,
|
||||
d_ff: int,
|
||||
activation=None,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True,
|
||||
dropout1=None,
|
||||
dropout2=None):
|
||||
super().__init__()
|
||||
dense1 = nn.Linear(d_model, d_ff, bias, device=get_current_device())
|
||||
act = nn.GELU() if activation is None else activation
|
||||
dense2 = nn.Linear(d_ff, d_model, bias, device=get_current_device())
|
||||
drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1
|
||||
drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2
|
||||
|
||||
self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2)
|
||||
|
||||
def forward(self, x):
|
||||
return self.ffn(x)
|
||||
|
||||
|
||||
class Widenet(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int,
|
||||
capacity_factor_train: float = 1.25,
|
||||
capacity_factor_eval: float = 2.0,
|
||||
drop_tks: bool = True,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
d_model: int = 768,
|
||||
num_heads: int = 12,
|
||||
d_kv: int = 64,
|
||||
d_ff: int = 4096,
|
||||
attention_drop: float = 0.,
|
||||
drop_rate: float = 0.1,
|
||||
drop_path: float = 0.):
|
||||
super().__init__()
|
||||
|
||||
embedding = VanillaPatchEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embed_size=d_model)
|
||||
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
|
||||
|
||||
shared_sa = VanillaSelfAttention(**moe_sa_args(
|
||||
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
|
||||
|
||||
noisy_func = NormalNoiseGenerator(num_experts)
|
||||
shared_router = Top2Router(capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_func=noisy_func,
|
||||
drop_tks=drop_tks)
|
||||
shared_experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = [
|
||||
TransformerLayer(att=shared_sa,
|
||||
ffn=MoeLayer(dim_model=d_model,
|
||||
num_experts=num_experts,
|
||||
router=shared_router,
|
||||
experts=shared_experts),
|
||||
norm1=nn.LayerNorm(d_model, eps=1e-6),
|
||||
norm2=nn.LayerNorm(d_model, eps=1e-6),
|
||||
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth)
|
||||
]
|
||||
norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
|
||||
nn.init.zeros_(self.linear.weight)
|
||||
nn.init.zeros_(self.linear.bias)
|
||||
self.widenet = nn.Sequential(embedding, embed_dropout, *blocks, norm)
|
||||
|
||||
def forward(self, x):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.widenet(x)
|
||||
x = torch.mean(x, dim=1)
|
||||
x = self.linear(x)
|
||||
return x
|
||||
|
||||
|
||||
class ViTMoE(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int or List[int],
|
||||
use_residual: bool = False,
|
||||
capacity_factor_train: float = 1.25,
|
||||
capacity_factor_eval: float = 2.0,
|
||||
drop_tks: bool = True,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
d_model: int = 768,
|
||||
num_heads: int = 12,
|
||||
d_kv: int = 64,
|
||||
d_ff: int = 3072,
|
||||
attention_drop: float = 0.,
|
||||
drop_rate: float = 0.1,
|
||||
drop_path: float = 0.):
|
||||
super().__init__()
|
||||
|
||||
assert depth % 2 == 0, "The number of layers should be even right now"
|
||||
|
||||
if isinstance(num_experts, list):
|
||||
assert len(num_experts) == depth // 2, \
|
||||
"The length of num_experts should equal to the number of MOE layers"
|
||||
num_experts_list = num_experts
|
||||
else:
|
||||
num_experts_list = [num_experts] * (depth // 2)
|
||||
|
||||
embedding = VanillaPatchEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embed_size=d_model)
|
||||
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = []
|
||||
for i in range(depth):
|
||||
sa = VanillaSelfAttention(**moe_sa_args(
|
||||
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
|
||||
|
||||
if i % 2 == 0:
|
||||
ffn = VanillaFFN(**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
|
||||
else:
|
||||
num_experts = num_experts_list[i // 2]
|
||||
experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
|
||||
ffn = MoeModule(dim_model=d_model,
|
||||
num_experts=num_experts,
|
||||
top_k=1 if use_residual else 2,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_policy='Jitter' if use_residual else 'Gaussian',
|
||||
drop_tks=drop_tks,
|
||||
use_residual=use_residual,
|
||||
expert_instance=experts,
|
||||
expert_cls=VanillaFFN,
|
||||
**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
|
||||
|
||||
layer = TransformerLayer(att=sa,
|
||||
ffn=ffn,
|
||||
norm1=nn.LayerNorm(d_model, eps=1e-6),
|
||||
norm2=nn.LayerNorm(d_model, eps=1e-6),
|
||||
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR))
|
||||
blocks.append(layer)
|
||||
|
||||
norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
|
||||
nn.init.zeros_(self.linear.weight)
|
||||
nn.init.zeros_(self.linear.bias)
|
||||
self.vitmoe = nn.Sequential(embedding, embed_dropout, *blocks, norm)
|
||||
|
||||
def forward(self, x):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.vitmoe(x)
|
||||
x = torch.mean(x, dim=1)
|
||||
x = self.linear(x)
|
||||
return x
|
|
@ -1,41 +0,0 @@
|
|||
from colossalai.context import ParallelMode
|
||||
from colossalai.nn.layer import WrappedDropout as Dropout
|
||||
|
||||
|
||||
def moe_sa_args(d_model: int,
|
||||
n_heads: int,
|
||||
d_kv: int,
|
||||
attention_drop: float = 0,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True):
|
||||
"""This is an example for args in moe self attention, since lots of modules should be
|
||||
adapted before putting them in experts.
|
||||
"""
|
||||
dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR)
|
||||
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
return dict(
|
||||
d_model=d_model,
|
||||
n_heads=n_heads,
|
||||
d_kv=d_kv,
|
||||
bias=bias,
|
||||
dropout1=dropout1,
|
||||
dropout2=dropout2
|
||||
)
|
||||
|
||||
|
||||
def moe_mlp_args(d_model: int,
|
||||
d_ff: int,
|
||||
drop_rate: float,
|
||||
bias: bool = True):
|
||||
"""This is an example for args of MLP in Experts, since lots of modules should be adapted
|
||||
before putting them in experts.
|
||||
"""
|
||||
dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
return dict(
|
||||
d_model=d_model,
|
||||
d_ff=d_ff,
|
||||
bias=bias,
|
||||
dropout1=dropout1,
|
||||
dropout2=dropout2
|
||||
)
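models.py consumes these helpers by unpacking the returned dicts into the expert constructors. A sketch of that call pattern (assuming an initialized ColossalAI parallel context; the import paths below are assumed from the package layout, not confirmed by this diff):

from model_zoo.moe.models import VanillaFFN, VanillaSelfAttention   # path assumed
from model_zoo.moe.util import moe_mlp_args, moe_sa_args            # path assumed

sa = VanillaSelfAttention(**moe_sa_args(d_model=768, n_heads=12, d_kv=64,
                                        attention_drop=0., drop_rate=0.1))
ffn = VanillaFFN(**moe_mlp_args(d_model=768, d_ff=3072, drop_rate=0.1))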
|
|
@ -1 +0,0 @@
|
|||
from .vit import *
|
|
@ -1,87 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import torch
|
||||
|
||||
from colossalai.registry import MODELS
|
||||
from colossalai.nn.model.model_from_config import ModelFromConfig
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class VisionTransformerFromConfig(ModelFromConfig):
|
||||
"""Vision Transformer from
|
||||
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/pdf/2010.11929>`_.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embedding_cfg: dict,
|
||||
norm_cfg: dict,
|
||||
block_cfg: dict,
|
||||
head_cfg: dict,
|
||||
token_fusion_cfg: dict = None,
|
||||
embed_dim=768,
|
||||
depth=12,
|
||||
drop_path_rate=0.,
|
||||
tensor_splitting_cfg: dict = None):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
self.num_tokens = 1
|
||||
self.tensor_splitting_cfg = tensor_splitting_cfg
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
|
||||
] # stochastic depth decay rule
|
||||
if token_fusion_cfg is None:
|
||||
token_fusion_cfg = []
|
||||
else:
|
||||
token_fusion_cfg = [token_fusion_cfg]
|
||||
|
||||
self.layers_cfg = [
|
||||
embedding_cfg,
|
||||
|
||||
# input tensor splitting
|
||||
*self._generate_tensor_splitting_cfg(),
|
||||
*token_fusion_cfg,
|
||||
|
||||
# blocks
|
||||
*self._generate_block_cfg(
|
||||
dpr=dpr, block_cfg=block_cfg, depth=depth),
|
||||
|
||||
# norm
|
||||
norm_cfg,
|
||||
|
||||
# head
|
||||
head_cfg
|
||||
]
|
||||
|
||||
def _fuse_tokens(self, x):
|
||||
cls_token = self.cls_token.expand(x.shape[0], -1, -1)
|
||||
x = torch.cat((cls_token, x), dim=1)
|
||||
return x
|
||||
|
||||
def _generate_block_cfg(self, dpr, depth, block_cfg):
|
||||
blocks_cfg = []
|
||||
|
||||
for i in range(depth):
|
||||
_cfg = block_cfg.copy()
|
||||
_cfg['droppath_cfg']['drop_path'] = dpr[i]
|
||||
blocks_cfg.append(_cfg)
|
||||
|
||||
return blocks_cfg
|
||||
|
||||
def _generate_tensor_splitting_cfg(self):
|
||||
if self.tensor_splitting_cfg:
|
||||
return [self.tensor_splitting_cfg]
|
||||
else:
|
||||
return []
|
||||
|
||||
def forward(self, x): # [512, 3, 32, 32]
|
||||
for layer in self.layers:
|
||||
if isinstance(x, tuple):
|
||||
x = layer(*x)
|
||||
else:
|
||||
x = layer(x)
|
||||
return x # [256, 5]
|
||||
|
||||
def init_weights(self):
|
||||
# TODO: add init weights
|
||||
pass
|
|
@ -1,415 +0,0 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.nn.layer.utils import CheckpointModule
|
||||
from colossalai.registry import LAYERS, MODELS
|
||||
from torch import dtype, nn
|
||||
|
||||
__all__ = [
|
||||
'VisionTransformer',
|
||||
'vit_lite_depth7_patch4_32',
|
||||
'vit_tiny_patch4_32',
|
||||
'vit_tiny_patch16_224',
|
||||
'vit_tiny_patch16_384',
|
||||
'vit_small_patch16_224',
|
||||
'vit_small_patch16_384',
|
||||
'vit_small_patch32_224',
|
||||
'vit_small_patch32_384',
|
||||
'vit_base_patch16_224',
|
||||
'vit_base_patch16_384',
|
||||
'vit_base_patch32_224',
|
||||
'vit_base_patch32_384',
|
||||
'vit_large_patch16_224',
|
||||
'vit_large_patch16_384',
|
||||
'vit_large_patch32_224',
|
||||
'vit_large_patch32_384',
|
||||
]
|
||||
|
||||
_init_rules = dict(
|
||||
torch=dict(
|
||||
embed=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer=col_nn.init.zeros_(),
|
||||
),
|
||||
transformer=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
),
|
||||
head=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
),
|
||||
),
|
||||
jax=dict(
|
||||
embed=dict(
|
||||
weight_initializer=col_nn.init.lecun_normal_(),
|
||||
bias_initializer=col_nn.init.zeros_(),
|
||||
position_embed_initializer=col_nn.init.trunc_normal_(std=.02),
|
||||
),
|
||||
transformer=dict(
|
||||
weight_initializer=col_nn.init.xavier_uniform_(),
|
||||
bias_initializer=col_nn.init.normal_(std=1e-6),
|
||||
),
|
||||
head=dict(
|
||||
weight_initializer=col_nn.init.zeros_(),
|
||||
bias_initializer=col_nn.init.zeros_(),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTEmbedding(nn.Module):
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embedding_dim: int,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.patch_embed = col_nn.PatchEmbedding(img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embedding_dim,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
**_init_rules[init_method]['embed'])
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.patch_embed(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTSelfAttention(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
attention_dropout: float,
|
||||
dropout: float,
|
||||
bias: bool = True,
|
||||
dtype: dtype = None,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.attention_head_size = dim // num_heads
|
||||
self.query_key_value = col_nn.Linear(dim,
|
||||
3 * dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.attention_dropout = col_nn.Dropout(attention_dropout)
|
||||
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True, **_init_rules[init_method]['transformer'])
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
|
||||
def forward(self, x):
|
||||
qkv = self.query_key_value(x)
|
||||
all_head_size = qkv.shape[-1] // 3
|
||||
num_attention_heads = all_head_size // self.attention_head_size
|
||||
new_qkv_shape = qkv.shape[:-1] + \
|
||||
(num_attention_heads, 3 * self.attention_head_size)
|
||||
qkv = qkv.view(new_qkv_shape)
|
||||
qkv = qkv.permute((0, 2, 1, 3))
|
||||
q, k, v = torch.chunk(qkv, 3, dim=-1)
|
||||
|
||||
x = torch.matmul(q, k.transpose(-1, -2))
|
||||
x = x / math.sqrt(self.attention_head_size)
|
||||
x = self.softmax(x)
|
||||
x = self.attention_dropout(x)
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_context_layer_shape = x.size()[:-2] + (all_head_size, )
|
||||
x = x.reshape(new_context_layer_shape)
|
||||
|
||||
x = self.dense(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTMLP(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
mlp_ratio: int,
|
||||
activation: Callable,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.dense_1 = col_nn.Linear(dim,
|
||||
mlp_ratio * dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.activation = activation
|
||||
self.dropout_1 = col_nn.Dropout(dropout)
|
||||
self.dense_2 = col_nn.Linear(mlp_ratio * dim,
|
||||
dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.dropout_2 = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense_1(x)
|
||||
x = self.activation(x)
|
||||
x = self.dropout_1(x)
|
||||
x = self.dense_2(x)
|
||||
x = self.dropout_2(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTHead(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_classes: int,
|
||||
representation_size: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
if representation_size:
|
||||
self.representation = col_nn.Linear(dim,
|
||||
representation_size,
|
||||
bias=bias,
|
||||
dtype=dtype,
|
||||
**_init_rules[init_method]['head'])
|
||||
else:
|
||||
self.representation = None
|
||||
representation_size = dim
|
||||
|
||||
self.dense = col_nn.Classifier(representation_size,
|
||||
num_classes,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['head'])
|
||||
|
||||
def forward(self, x):
|
||||
x = x[:, 0]
|
||||
if self.representation is not None:
|
||||
x = self.representation(x)
|
||||
x = self.dense(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTBlock(CheckpointModule):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: int,
|
||||
activation: Callable,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
drop_path: float = 0.,
|
||||
layernorm_epsilon: float = 1e-6,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
checkpoint: bool = False,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__(checkpoint)
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = ViTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
dtype=dtype,
|
||||
init_method=init_method)
|
||||
self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.mlp = ViTMLP(dim=dim,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
init_method=init_method)
|
||||
|
||||
def _forward(self, x):
|
||||
x = x + self.drop_path(self.attn(self.norm1(x)))
|
||||
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
||||
return x
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class VisionTransformer(nn.Module):
|
||||
def __init__(self,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
num_heads: int = 12,
|
||||
dim: int = 768,
|
||||
mlp_ratio: int = 4,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.1,
|
||||
drop_path: float = 0.,
|
||||
layernorm_epsilon: float = 1e-6,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
representation_size: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
checkpoint: bool = False,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
|
||||
embed = ViTEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embedding_dim=dim,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
init_method=init_method)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = [
|
||||
ViTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
drop_path=dpr[i],
|
||||
activation=activation,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
checkpoint=checkpoint,
|
||||
init_method=init_method,
|
||||
) for i in range(depth)
|
||||
]
|
||||
|
||||
norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
head = ViTHead(dim=dim,
|
||||
num_classes=num_classes,
|
||||
representation_size=representation_size,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
init_method=init_method)
|
||||
|
||||
self.layers = nn.Sequential(
|
||||
embed,
|
||||
*blocks,
|
||||
norm,
|
||||
head,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.layers(x)
|
||||
return x
|
||||
|
||||
|
||||
def _create_vit_model(**model_kwargs):
|
||||
model = VisionTransformer(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_lite_depth7_patch4_32(**kwargs):
|
||||
model_kwargs = dict(img_size=32, patch_size=4, dim=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch4_32(**kwargs):
|
||||
model_kwargs = dict(img_size=32, patch_size=4, dim=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
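For a rough sense of scale, the token sequence each variant feeds into the transformer has (img_size / patch_size)^2 patch tokens plus one class token (ViTHead reads x[:, 0], which assumes the patch embedding prepends a class token):

for name, img_size, patch_size in [('vit_lite_depth7_patch4_32', 32, 4),
                                   ('vit_base_patch16_224', 224, 16),
                                   ('vit_large_patch32_384', 384, 32)]:
    num_patches = (img_size // patch_size) ** 2
    print(name, num_patches + 1)   # 65, 197, 145 tokens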
|
|
@ -1,3 +1,4 @@
|
|||
pytest
|
||||
torchvision
|
||||
transformers
|
||||
titans
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -6,19 +7,21 @@ import colossalai
|
|||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
from colossalai.amp.amp_type import AMP_TYPE
|
||||
from colossalai.builder import build_pipeline_model
|
||||
from colossalai.engine.schedule import PipelineSchedule
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import LinearWarmupLR
|
||||
from colossalai.nn.loss import CrossEntropyLoss
|
||||
from colossalai.amp import AMP_TYPE
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import free_port, get_dataloader
|
||||
from colossalai.engine.gradient_accumulation import GradAccumLrSchedulerByStep
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.testing import rerun_if_address_is_in_use
|
||||
from model_zoo.vit import vit_tiny_patch4_32
|
||||
from torchvision import transforms
|
||||
from torchvision.datasets import CIFAR10
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.utils import is_using_pp, get_dataloader
|
||||
from colossalai.utils.model.pipelinable import PipelinableContext
|
||||
from tqdm import tqdm
|
||||
|
||||
from titans.dataloader.cifar10 import build_cifar
|
||||
from titans.model.vit import vit_tiny_patch4_32
|
||||
|
||||
BATCH_SIZE = 4
|
||||
NUM_EPOCHS = 60
|
||||
|
@ -34,35 +37,35 @@ def run_trainer(rank, world_size, port):
|
|||
|
||||
logger = get_dist_logger()
|
||||
|
||||
model = vit_tiny_patch4_32()
|
||||
pipe_model = build_pipeline_model(model.layers, num_chunks=1)
|
||||
# get logger
|
||||
logger = get_dist_logger()
|
||||
|
||||
# build dataloaders
|
||||
transform_train = transforms.Compose([
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
|
||||
])
|
||||
pipelinable = PipelinableContext()
|
||||
with pipelinable:
|
||||
model = vit_tiny_patch4_32()
|
||||
pipelinable.to_layer_list()
|
||||
pipelinable.load_policy("uniform")
|
||||
model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
|
||||
|
||||
train_dataset = CIFAR10(root=Path(os.environ['DATA']), train=True, download=True, transform=transform_train)
|
||||
train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
|
||||
# create dataloaders
|
||||
root = Path(os.environ['DATA'])
|
||||
train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, pad_if_needed=True, crop=32, resize=32)
|
||||
|
||||
# build criterion
|
||||
criterion = CrossEntropyLoss()
|
||||
# create loss function
|
||||
criterion = CrossEntropyLoss(label_smoothing=0.1)
|
||||
|
||||
# optimizer
|
||||
optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0)
|
||||
# create optimizer
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
|
||||
|
||||
# lr_scheduler
|
||||
steps_per_epoch = GradAccumLrSchedulerByStep.compute_effective_steps_per_epoch(train_dataloader, accumulate_size=2)
|
||||
total_steps = steps_per_epoch * NUM_EPOCHS
|
||||
warmup_steps = steps_per_epoch * WARMUP_EPOCHS
|
||||
lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps)
|
||||
# create lr scheduler
|
||||
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
|
||||
|
||||
engine, train_dataloader, _, lr_scheduler = colossalai.initialize(pipe_model,
|
||||
optimizer,
|
||||
criterion,
|
||||
train_dataloader,
|
||||
lr_scheduler=lr_scheduler)
|
||||
# initialize
|
||||
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
|
||||
optimizer=optimizer,
|
||||
criterion=criterion,
|
||||
train_dataloader=train_dataloader,
|
||||
test_dataloader=test_dataloader)
|
||||
|
||||
logger = get_dist_logger()