[titans]remove model zoo (#1042)

* [CLI] add CLI launcher

* Revert "[CLI] add CLI launcher"

This reverts commit df7e6506d4.

* rm model zoo
pull/1015/head^2
YuliangLiu0306 2022-05-31 10:40:47 +08:00 committed by GitHub
parent 0dac86866b
commit 9feff0f760
13 changed files with 38 additions and 1540 deletions

View File

View File

@@ -1 +0,0 @@
from .gpt import *

View File

@@ -1,478 +0,0 @@
import math
from typing import Callable
import torch
from colossalai import nn as col_nn
from colossalai.builder.pipeline import partition_uniform
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn.layer.utils import CheckpointModule, divide
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
from colossalai.registry import LAYERS, LOSSES, MODELS
from colossalai.utils import get_current_device
from torch import dtype, nn
__all__ = [
'GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt2_xl_pipeline',
'gpt2_8B_pipeline', 'gpt3', 'gpt3_pipeline'
]
@LAYERS.register_module
class GPTEmbedding(nn.Module):
def __init__(self,
embedding_dim: int,
vocab_size: int,
max_position_embeddings: int,
num_tokentypes: int = 0,
padding_idx: int = None,
dropout: float = 0.,
dtype: dtype = None) -> None:
super().__init__()
self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype)
self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype)
if num_tokentypes > 0:
self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype)
else:
self.tokentype_embeddings = None
self.dropout = col_nn.Dropout(dropout)
@property
def word_embedding_weight(self):
return self.word_embeddings.weight
def forward(self, input_ids, position_ids=None, tokentype_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0)
x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids)
if self.tokentype_embeddings is not None and tokentype_ids is not None:
x = x + self.tokentype_embeddings(tokentype_ids)
x = self.dropout(x)
return x
@LAYERS.register_module
class GPTSelfAttention(nn.Module):
def __init__(self,
dim: int,
num_heads: int,
attention_dropout: float,
dropout: float,
bias: bool = True,
fuse_scale_mask_softmax: bool = False,
dtype: dtype = None) -> None:
super().__init__()
self.fuse_scale_mask_softmax = fuse_scale_mask_softmax
self.attention_head_size = divide(dim, num_heads)
self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias)
if fuse_scale_mask_softmax:
from colossalai.kernel import FusedScaleMaskSoftmax
from colossalai.kernel.cuda_native.scaled_softmax import \
AttnMaskType
self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True,
input_in_bf16=False,
attn_mask_type=AttnMaskType.causal,
scaled_masked_softmax_fusion=True,
mask_func=None,
softmax_in_fp32=True,
scale=math.sqrt(self.attention_head_size))
else:
self.softmax = nn.Softmax(dim=-1)
self.attention_dropout = col_nn.Dropout(attention_dropout)
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True)
self.dropout = col_nn.Dropout(dropout)
def forward(self, x, attention_mask=None):
qkv = self.query_key_value(x)
q, k, v = torch.chunk(qkv, 3, dim=-1)
all_head_size = q.shape[-1]
num_attention_heads = divide(all_head_size, self.attention_head_size)
new_shape = q.shape[:-1] + \
(num_attention_heads, self.attention_head_size)
q = q.view(new_shape).permute((0, 2, 1, 3)).contiguous()
k = k.view(new_shape).permute((0, 2, 1, 3)).contiguous()
v = v.view(new_shape).permute((0, 2, 1, 3)).contiguous()
x = torch.matmul(q, k.transpose(-1, -2))
if self.fuse_scale_mask_softmax:
x = self.softmax(x, attention_mask)
else:
x = x / math.sqrt(self.attention_head_size)
# causal mask
q_len, k_len = q.size(-2), k.size(-2)
causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8,
device=get_current_device())).view(1, 1, q_len, k_len).bool()
x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device()))
if attention_mask is not None:
x = x + attention_mask
x = self.softmax(x)
x = self.attention_dropout(x)
x = torch.matmul(x, v)
x = x.transpose(1, 2)
new_context_layer_shape = x.size()[:-2] + (all_head_size,)
x = x.reshape(new_context_layer_shape)
x = self.dense(x)
x = self.dropout(x)
return x
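# Illustrative note (not part of the removed file): the causal mask built in
# forward() above, for q_len = k_len = 4, is
#   torch.tril(torch.ones(4, 4)).bool() ->
#     [[1, 0, 0, 0],
#      [1, 1, 0, 0],
#      [1, 1, 1, 0],
#      [1, 1, 1, 1]]
# Scores at masked (False) positions are replaced with -1e4 before the softmax,
# so each token attends only to itself and earlier positions.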
@LAYERS.register_module
class GPTMLP(nn.Module):
def __init__(self,
dim: int,
mlp_ratio: float,
activation: Callable,
dropout: float,
dtype: dtype = None,
bias: bool = True):
super().__init__()
intermediate_dim = int(dim * mlp_ratio)
self.dense_1 = col_nn.Linear(dim, intermediate_dim, dtype=dtype, bias=bias)
self.activation = activation
self.dense_2 = col_nn.Linear(intermediate_dim, dim, dtype=dtype, bias=bias)
self.dropout = col_nn.Dropout(dropout)
def forward(self, x):
x = self.dense_1(x)
x = self.activation(x)
x = self.dense_2(x)
x = self.dropout(x)
return x
@LAYERS.register_module
class GPTBlock(CheckpointModule):
def __init__(self,
dim: int,
num_heads: int,
mlp_ratio: float,
activation: Callable,
attention_dropout: float = 0.,
dropout: float = 0.,
layernorm_epsilon: float = 1e-5,
dtype: dtype = None,
bias: bool = True,
apply_post_layernorm: bool = False,
fuse_scale_mask_softmax: bool = False,
checkpoint: bool = False,
activation_offload: bool = False):
super().__init__(checkpoint, activation_offload)
self.apply_post_layernorm = apply_post_layernorm
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.attn = GPTSelfAttention(dim=dim,
num_heads=num_heads,
attention_dropout=attention_dropout,
dropout=dropout,
bias=bias,
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
dtype=dtype)
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.mlp = GPTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, dropout=dropout, dtype=dtype, bias=bias)
def _forward(self, x, attention_mask=None):
if not self.apply_post_layernorm:
residual = x
x = self.norm1(x)
if self.apply_post_layernorm:
residual = x
x = residual + self.attn(x, attention_mask)
if not self.apply_post_layernorm:
residual = x
x = self.norm2(x)
if self.apply_post_layernorm:
residual = x
x = residual + self.mlp(x)
return x, attention_mask
@LAYERS.register_module
class GPTLMHead(nn.Module):
def __init__(self,
dim: int,
vocab_size: int,
word_embeeding_weight: nn.Parameter = None,
bias: bool = False,
dtype: dtype = None) -> None:
super().__init__()
self.dense = col_nn.Classifier(dim, vocab_size, word_embeeding_weight, bias=bias, dtype=dtype)
@property
def weight(self):
return self.dense.weight
def forward(self, x):
x = self.dense(x)
return x
@LOSSES.register_module
class GPTLMLoss(nn.Module):
def __init__(self):
super().__init__()
self.loss = col_nn.CrossEntropyLoss()
def forward(self, logits, labels):
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
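# Illustrative sketch (not part of the removed file): the one-token shift above
# aligns position t of the logits with token t + 1 of the labels, i.e. standard
# next-token prediction. A plain-PyTorch equivalent with made-up shapes:
#
# import torch
# logits = torch.randn(2, 8, 50304)                # [batch, seq, vocab]
# labels = torch.randint(0, 50304, (2, 8))         # [batch, seq]
# shift_logits = logits[..., :-1, :]               # predictions for tokens 0..6
# shift_labels = labels[..., 1:]                   # targets are tokens 1..7
# loss = torch.nn.functional.cross_entropy(
#     shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))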
@MODELS.register_module
class GPT(nn.Module):
def __init__(self,
vocab_size: int = 50304,
max_position_embeddings: int = 1024,
dim: int = 768,
num_heads: int = 12,
depth: int = 12,
mlp_ratio: float = 4.0,
dropout: float = 0.1,
embedding_dropout: float = 0.1,
attention_dropout: float = 0.1,
layernorm_epsilon: float = 1e-5,
activation: Callable = nn.functional.gelu,
padding_idx: int = None,
dtype: dtype = None,
bias: bool = True,
apply_post_layernorm: bool = False,
fuse_scale_mask_softmax: bool = False,
checkpoint: bool = False,
activation_offload: bool = False) -> None:
super().__init__()
self.embed = GPTEmbedding(embedding_dim=dim,
vocab_size=vocab_size,
max_position_embeddings=max_position_embeddings,
padding_idx=padding_idx,
dropout=embedding_dropout,
dtype=dtype)
self.blocks = nn.ModuleList([
GPTBlock(
dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
activation=activation,
attention_dropout=attention_dropout,
dropout=dropout,
layernorm_epsilon=layernorm_epsilon,
dtype=dtype,
bias=bias,
apply_post_layernorm=apply_post_layernorm,
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
checkpoint=checkpoint,
activation_offload=activation_offload
) for _ in range(depth)
])
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.head = GPTLMHead(dim=dim,
vocab_size=vocab_size,
word_embeeding_weight=self.embed.word_embedding_weight,
dtype=dtype)
def forward(self, input_ids, attention_mask=None):
x = self.embed(input_ids)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# Adapted from huggingface
if attention_mask is not None:
batch_size = input_ids.shape[0]
attention_mask = attention_mask.view(batch_size, -1)
attention_mask = col_nn.partition_batch(attention_mask)
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
attention_mask = (1.0 - attention_mask) * -10000.0
for block in self.blocks:
x, attention_mask = block(x, attention_mask)
x = self.head(self.norm(x))
return x
class PipelineGPT(nn.Module):
def __init__(self,
vocab_size: int = 50304,
max_position_embeddings: int = 1024,
dim: int = 768,
num_heads: int = 12,
depth: int = 12,
mlp_ratio: float = 4.0,
dropout: float = 0.1,
embedding_dropout: float = 0.1,
attention_dropout: float = 0.1,
layernorm_epsilon: float = 1e-5,
activation: Callable = nn.functional.gelu,
padding_idx: int = None,
dtype: dtype = None,
bias: bool = True,
apply_post_layernorm: bool = False,
fuse_scale_mask_softmax: bool = False,
checkpoint: bool = False,
first: bool = False,
last: bool = False):
super().__init__()
self.checkpoint = checkpoint
self.first = first
self.last = last
if first:
self.embed = GPTEmbedding(embedding_dim=dim,
vocab_size=vocab_size,
max_position_embeddings=max_position_embeddings,
padding_idx=padding_idx,
dropout=embedding_dropout,
dtype=dtype)
self.blocks = nn.ModuleList([
GPTBlock(
dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
activation=activation,
attention_dropout=attention_dropout,
dropout=dropout,
layernorm_epsilon=layernorm_epsilon,
dtype=dtype,
bias=bias,
apply_post_layernorm=apply_post_layernorm,
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
checkpoint=checkpoint,
) for _ in range(depth)
])
if self.last:
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.head = GPTLMHead(dim=dim, vocab_size=vocab_size, dtype=dtype)
def forward(self, x=None, input_ids=None, attention_mask=None):
if self.first:
x = self.embed(input_ids)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# Adapted from huggingface
if attention_mask is not None:
if self.first:
batch_size = input_ids.shape[0]
else:
batch_size = x.shape[0]
attention_mask = attention_mask.view(batch_size, -1)
attention_mask = col_nn.partition_batch(attention_mask)
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
attention_mask = (1.0 - attention_mask) * -10000.0
for block in self.blocks:
x, attention_mask = block(x, attention_mask)
if self.last:
x = self.head(self.norm(x))
return x
def _create_gpt_model(**model_kwargs):
model = GPT(**model_kwargs)
return model
def _create_gpt_pipeline_model(depth=48, num_chunks=1, layer_partitions=None, **model_kwargs):
logger = get_dist_logger()
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
rank = gpc.get_global_rank()
wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
parts = partition_uniform(depth, pipeline_size,
num_chunks)[pipeline_rank] if layer_partitions is None else layer_partitions
models = []
for start, end in parts:
model_kwargs['first'] = start == 0
model_kwargs['last'] = end == depth
model_kwargs['depth'] = end - start
chunk = PipelineGPT(**model_kwargs).to(get_current_device())
if start == 0:
wrapper.register_parameter(chunk.embed.word_embedding_weight)
elif end == depth:
wrapper.register_parameter(chunk.head.weight)
models.append(chunk)
logger.info(f'==> Rank {rank} built layer {start}-{end} / total {depth}')
if len(models) == 1:
model = models[0]
else:
model = nn.ModuleList(models)
return model
@MODELS.register_module
def gpt2_small(**kwargs):
model_kwargs = dict(dim=768, depth=12, num_heads=12, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt2_medium(**kwargs):
model_kwargs = dict(dim=1024, depth=24, num_heads=8, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt2_large(**kwargs):
model_kwargs = dict(dim=1536, depth=36, num_heads=12, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt2_xl(**kwargs):
model_kwargs = dict(dim=1600, depth=48, num_heads=16, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt2_8B(**kwargs):
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt2_xl_pipeline(**kwargs):
model_kwargs = dict(dim=1600, depth=48, num_heads=20, **kwargs)
return _create_gpt_pipeline_model(**model_kwargs)
@MODELS.register_module
def gpt2_8B_pipeline(**kwargs):
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
return _create_gpt_pipeline_model(**model_kwargs)
@MODELS.register_module
def gpt3(**kwargs):
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
return _create_gpt_model(**model_kwargs)
@MODELS.register_module
def gpt3_pipeline(**kwargs):
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
return _create_gpt_pipeline_model(**model_kwargs)
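# Hypothetical usage of the factories above before their removal (import path
# follows the deleted model_zoo layout; after this commit the repo depends on the
# external titans package instead, see the requirements change below):
#
# from model_zoo.gpt.gpt import gpt2_small, GPTLMLoss
# model = gpt2_small(checkpoint=True, fuse_scale_mask_softmax=False)
# criterion = GPTLMLoss()
# logits = model(input_ids, attention_mask)      # input_ids: [batch, seq]
# loss = criterion(logits, input_ids)            # labels are shifted inside the loss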

View File

@@ -1,26 +0,0 @@
import torch
import torch.nn as nn
from colossalai.nn.layer import WrappedDropPath as DropPath
class TransformerLayer(nn.Module):
"""Transformer layer builder.
"""
def __init__(self,
att: nn.Module,
ffn: nn.Module,
norm1: nn.Module,
norm2: nn.Module,
droppath=None,
droppath_rate: float = 0):
super().__init__()
self.att = att
self.ffn = ffn
self.norm1 = norm1
self.norm2 = norm2
self.droppath = DropPath(droppath_rate) if droppath is None else droppath
def forward(self, x):
x = x + self.droppath(self.att(self.norm1(x)))
x = x + self.droppath(self.ffn(self.norm2(x)))
return x
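# Sketch of how this builder is wired up (mirrors its use in the MoE ViT models of
# this package; the dimensions are illustrative):
#
# att = VanillaSelfAttention(d_model=768, n_heads=12, d_kv=64)
# ffn = VanillaFFN(d_model=768, d_ff=3072)
# layer = TransformerLayer(att=att, ffn=ffn,
#                          norm1=nn.LayerNorm(768, eps=1e-6),
#                          norm2=nn.LayerNorm(768, eps=1e-6),
#                          droppath_rate=0.1)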

View File

@@ -1,2 +0,0 @@
from .models import Widenet, ViTMoE
from .gpt import MOEGPT, prmoe_4b, prmoe_31b, prmoe_51b

View File

@@ -1,229 +0,0 @@
from typing import Callable, List
from torch import dtype, nn
from colossalai import nn as col_nn
from colossalai.registry import LAYERS, MODELS
from colossalai.nn.layer import MoeModule
from colossalai.context import MOE_CONTEXT
from colossalai.logging import get_dist_logger
from colossalai.nn.layer.utils import CheckpointModule, divide
from model_zoo.gpt.gpt import GPTEmbedding, GPTSelfAttention, GPTMLP, GPTBlock, GPTLMHead
@LAYERS.register_module
class MOEGPTBlock(CheckpointModule):
def __init__(self,
num_experts: int,
dim: int,
num_heads: int,
mlp_ratio: float,
activation: Callable,
capacity_factor_train: float = 1.0,
capacity_factor_eval: float = 1.0,
use_residual: bool = False,
attention_dropout: float = 0.,
dropout: float = 0.,
layernorm_epsilon: float = 1e-5,
dtype: dtype = None,
bias: bool = True,
apply_post_layernorm: bool = False,
fuse_scale_mask_softmax: bool = False,
checkpoint: bool = False):
super().__init__(checkpoint)
self.apply_post_layernorm = apply_post_layernorm
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.attn = GPTSelfAttention(dim=dim,
num_heads=num_heads,
attention_dropout=attention_dropout,
dropout=dropout,
bias=bias,
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
dtype=dtype)
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
mpl_factory_dict = dict(dim=dim,
mlp_ratio=mlp_ratio,
activation=activation,
dropout=dropout,
dtype=dtype,
bias=bias)
self.mlp = MoeModule(dim_model=dim,
num_experts=num_experts,
top_k=1,
capacity_factor_train=capacity_factor_train,
capacity_factor_eval=capacity_factor_eval,
noisy_policy='Jitter',
use_residual=use_residual,
expert_cls=GPTMLP,
**mpl_factory_dict)
def _forward(self, x, attention_mask=None):
if not self.apply_post_layernorm:
residual = x
x = self.norm1(x)
if self.apply_post_layernorm:
residual = x
x = residual + self.attn(x, attention_mask)
if not self.apply_post_layernorm:
residual = x
x = self.norm2(x)
if self.apply_post_layernorm:
residual = x
x = residual + self.mlp(x)
return x, attention_mask
@MODELS.register_module
class MOEGPT(nn.Module):
def __init__(self,
num_experts: int or List[int],
use_residual: bool = False,
capacity_factor_train: float = 1.0,
capacity_factor_eval: float = 1.0,
vocab_size: int = 50304,
max_position_embeddings: int = 1024,
dim: int = 768,
num_heads: int = 12,
depth: int = 12,
mlp_ratio: float = 4.0,
dropout: float = 0.1,
embedding_dropout: float = 0.1,
attention_dropout: float = 0.1,
layernorm_epsilon: float = 1e-5,
activation: Callable = nn.functional.gelu,
padding_idx: int = None,
dtype: dtype = None,
bias: bool = True,
apply_post_layernorm: bool = False,
fuse_scale_mask_softmax: bool = False,
checkpoint: bool = False) -> None:
super().__init__()
half_depth = divide(depth, 2)
if isinstance(num_experts, list):
assert len(num_experts) == half_depth, \
"The length of num_experts should equal to the number of MOE layers"
num_experts_list = num_experts
else:
num_experts_list = [num_experts] * half_depth
self.embed = GPTEmbedding(embedding_dim=dim,
vocab_size=vocab_size,
max_position_embeddings=max_position_embeddings,
padding_idx=padding_idx,
dropout=embedding_dropout,
dtype=dtype)
block_list = []
block_factory_dict = dict(dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
activation=activation,
attention_dropout=attention_dropout,
dropout=dropout,
layernorm_epsilon=layernorm_epsilon,
dtype=dtype,
bias=bias,
apply_post_layernorm=apply_post_layernorm,
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
checkpoint=checkpoint)
for i in range(depth):
if i % 2 == 0:
block_module = GPTBlock(**block_factory_dict)
else:
num_experts = num_experts_list[i // 2]
block_module = MOEGPTBlock(num_experts=num_experts,
capacity_factor_train=capacity_factor_train,
capacity_factor_eval=capacity_factor_eval,
use_residual=use_residual,
**block_factory_dict)
block_list.append(block_module)
self.blocks = nn.ModuleList(block_list)
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.head = GPTLMHead(dim=dim,
vocab_size=vocab_size,
word_embeeding_weight=self.embed.word_embedding_weight,
dtype=dtype)
def forward(self, input_ids, attention_mask=None):
MOE_CONTEXT.reset_loss()
x = self.embed(input_ids)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# Adapted from huggingface
if attention_mask is not None:
batch_size = input_ids.shape[0]
attention_mask = attention_mask.view(batch_size, -1)
attention_mask = col_nn.partition_batch(attention_mask)
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
attention_mask = (1.0 - attention_mask) * -10000.0
for block in self.blocks:
x, attention_mask = block(x, attention_mask)
x = self.head(self.norm(x))
return x
def _create_moegpt_model(**model_kwargs):
model = MOEGPT(**model_kwargs)
return model
def _prmoe_check_sanity(kwargs_dict):
logger = get_dist_logger()
if not kwargs_dict.pop('use_residual', False):
logger.warning(
"If you want to use PR-MOE, please set 'use_residual' to True. "
"Otherwise, we'll force 'use_residual' to True.",
ranks=[0])
@MODELS.register_module
def prmoe_4b(**kwargs):
_prmoe_check_sanity(kwargs)
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64],
use_residual=True,
dim=1024,
depth=24,
num_heads=16,
**kwargs)
return _create_moegpt_model(**model_kwargs)
@MODELS.register_module
def prmoe_31b(**kwargs):
_prmoe_check_sanity(kwargs)
model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128],
use_residual=True,
dim=2048,
depth=24,
num_heads=16,
**kwargs)
return _create_moegpt_model(**model_kwargs)
@MODELS.register_module
def prmoe_51b(**kwargs):
_prmoe_check_sanity(kwargs)
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64],
use_residual=True,
dim=3072,
depth=32,
num_heads=24,
**kwargs)
return _create_moegpt_model(**model_kwargs)
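# Hypothetical construction of the PR-MoE variants above (assumes the MoE parallel
# context has already been initialized by colossalai):
#
# model = prmoe_4b(checkpoint=True)    # 24 layers, every other layer is a MoE block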

View File

@@ -1,226 +0,0 @@
import math
import torch
import torch.nn as nn
from colossalai.context import ParallelMode
from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \
WrappedDropout as Dropout, WrappedDropPath as DropPath
from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator, MoeModule
from .util import moe_sa_args, moe_mlp_args
from ..helper import TransformerLayer
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.utils import get_current_device
from typing import List
class VanillaSelfAttention(nn.Module):
"""Standard ViT self attention.
"""
def __init__(self,
d_model: int,
n_heads: int,
d_kv: int,
attention_drop: float = 0,
drop_rate: float = 0,
bias: bool = True,
dropout1=None,
dropout2=None):
super().__init__()
self.n_heads = n_heads
self.d_kv = d_kv
self.scale = 1.0 / math.sqrt(self.d_kv)
self.dense1 = nn.Linear(d_model, 3 * n_heads * d_kv, bias, device=get_current_device())
self.softmax = nn.Softmax(dim=-1)
self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1
self.dense2 = nn.Linear(n_heads * d_kv, d_model, device=get_current_device())
self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2
def forward(self, x):
qkv = self.dense1(x)
new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv)
qkv = qkv.view(*new_shape)
qkv = qkv.permute(2, 0, 3, 1, 4)
q, k, v = qkv[:]
x = torch.matmul(q, k.transpose(-2, -1)) * self.scale
x = self.atten_drop(self.softmax(x))
x = torch.matmul(x, v)
x = x.transpose(1, 2)
new_shape = x.shape[:2] + (self.n_heads * self.d_kv,)
x = x.reshape(*new_shape)
x = self.dense2(x)
x = self.dropout(x)
return x
class VanillaFFN(nn.Module):
"""FFN composed with two linear layers, also called MLP.
"""
def __init__(self,
d_model: int,
d_ff: int,
activation=None,
drop_rate: float = 0,
bias: bool = True,
dropout1=None,
dropout2=None):
super().__init__()
dense1 = nn.Linear(d_model, d_ff, bias, device=get_current_device())
act = nn.GELU() if activation is None else activation
dense2 = nn.Linear(d_ff, d_model, bias, device=get_current_device())
drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1
drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2
self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2)
def forward(self, x):
return self.ffn(x)
class Widenet(nn.Module):
def __init__(self,
num_experts: int,
capacity_factor_train: float = 1.25,
capacity_factor_eval: float = 2.0,
drop_tks: bool = True,
img_size: int = 224,
patch_size: int = 16,
in_chans: int = 3,
num_classes: int = 1000,
depth: int = 12,
d_model: int = 768,
num_heads: int = 12,
d_kv: int = 64,
d_ff: int = 4096,
attention_drop: float = 0.,
drop_rate: float = 0.1,
drop_path: float = 0.):
super().__init__()
embedding = VanillaPatchEmbedding(img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_size=d_model)
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
shared_sa = VanillaSelfAttention(**moe_sa_args(
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
noisy_func = NormalNoiseGenerator(num_experts)
shared_router = Top2Router(capacity_factor_train=capacity_factor_train,
capacity_factor_eval=capacity_factor_eval,
noisy_func=noisy_func,
drop_tks=drop_tks)
shared_experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
blocks = [
TransformerLayer(att=shared_sa,
ffn=MoeLayer(dim_model=d_model,
num_experts=num_experts,
router=shared_router,
experts=shared_experts),
norm1=nn.LayerNorm(d_model, eps=1e-6),
norm2=nn.LayerNorm(d_model, eps=1e-6),
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth)
]
norm = nn.LayerNorm(d_model, eps=1e-6)
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
nn.init.zeros_(self.linear.weight)
nn.init.zeros_(self.linear.bias)
self.widenet = nn.Sequential(embedding, embed_dropout, *blocks, norm)
def forward(self, x):
MOE_CONTEXT.reset_loss()
x = self.widenet(x)
x = torch.mean(x, dim=1)
x = self.linear(x)
return x
class ViTMoE(nn.Module):
def __init__(self,
num_experts: int or List[int],
use_residual: bool = False,
capacity_factor_train: float = 1.25,
capacity_factor_eval: float = 2.0,
drop_tks: bool = True,
img_size: int = 224,
patch_size: int = 16,
in_chans: int = 3,
num_classes: int = 1000,
depth: int = 12,
d_model: int = 768,
num_heads: int = 12,
d_kv: int = 64,
d_ff: int = 3072,
attention_drop: float = 0.,
drop_rate: float = 0.1,
drop_path: float = 0.):
super().__init__()
assert depth % 2 == 0, "The number of layers should be even right now"
if isinstance(num_experts, list):
assert len(num_experts) == depth // 2, \
"The length of num_experts should equal to the number of MOE layers"
num_experts_list = num_experts
else:
num_experts_list = [num_experts] * (depth // 2)
embedding = VanillaPatchEmbedding(img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_size=d_model)
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
blocks = []
for i in range(depth):
sa = VanillaSelfAttention(**moe_sa_args(
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
if i % 2 == 0:
ffn = VanillaFFN(**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
else:
num_experts = num_experts_list[i // 2]
experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
ffn = MoeModule(dim_model=d_model,
num_experts=num_experts,
top_k=1 if use_residual else 2,
capacity_factor_train=capacity_factor_train,
capacity_factor_eval=capacity_factor_eval,
noisy_policy='Jitter' if use_residual else 'Gaussian',
drop_tks=drop_tks,
use_residual=use_residual,
expert_instance=experts,
expert_cls=VanillaFFN,
**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
layer = TransformerLayer(att=sa,
ffn=ffn,
norm1=nn.LayerNorm(d_model, eps=1e-6),
norm2=nn.LayerNorm(d_model, eps=1e-6),
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR))
blocks.append(layer)
norm = nn.LayerNorm(d_model, eps=1e-6)
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
nn.init.zeros_(self.linear.weight)
nn.init.zeros_(self.linear.bias)
self.vitmoe = nn.Sequential(embedding, embed_dropout, *blocks, norm)
def forward(self, x):
MOE_CONTEXT.reset_loss()
x = self.vitmoe(x)
x = torch.mean(x, dim=1)
x = self.linear(x)
return x
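# Illustrative construction of the two MoE ViT variants above (expert counts and
# depth are examples; MoE context setup by colossalai is assumed):
#
# widenet = Widenet(num_experts=4, depth=12)   # attention, router and experts shared across layers
# vitmoe = ViTMoE(num_experts=4, depth=12)     # dense FFN layers alternate with MoE FFN layers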

View File

@@ -1,41 +0,0 @@
from colossalai.context import ParallelMode
from colossalai.nn.layer import WrappedDropout as Dropout
def moe_sa_args(d_model: int,
n_heads: int,
d_kv: int,
attention_drop: float = 0,
drop_rate: float = 0,
bias: bool = True):
"""This is an example for args in moe self attention, since lots of modules should be
adapted before putting them in experts.
"""
dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR)
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
return dict(
d_model=d_model,
n_heads=n_heads,
d_kv=d_kv,
bias=bias,
dropout1=dropout1,
dropout2=dropout2
)
def moe_mlp_args(d_model: int,
d_ff: int,
drop_rate: float,
bias: bool = True):
"""This is an example for args of MLP in Experts, since lots of modules should be adapted
before putting them in experts.
"""
dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
return dict(
d_model=d_model,
d_ff=d_ff,
bias=bias,
dropout1=dropout1,
dropout2=dropout2
)
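# Sketch of how these helpers are consumed (mirrors the calls in models.py above;
# the dimensions are illustrative):
#
# sa = VanillaSelfAttention(**moe_sa_args(d_model=768, n_heads=12, d_kv=64,
#                                         attention_drop=0.0, drop_rate=0.1))
# ffn = VanillaFFN(**moe_mlp_args(d_model=768, d_ff=3072, drop_rate=0.1))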

View File

@@ -1 +0,0 @@
from .vit import *

View File

@@ -1,87 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
from colossalai.registry import MODELS
from colossalai.nn.model.model_from_config import ModelFromConfig
@MODELS.register_module
class VisionTransformerFromConfig(ModelFromConfig):
"""Vision Transformer from
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/pdf/2010.11929>`_.
"""
def __init__(self,
embedding_cfg: dict,
norm_cfg: dict,
block_cfg: dict,
head_cfg: dict,
token_fusion_cfg: dict = None,
embed_dim=768,
depth=12,
drop_path_rate=0.,
tensor_splitting_cfg: dict = None):
super().__init__()
self.embed_dim = embed_dim
self.num_tokens = 1
self.tensor_splitting_cfg = tensor_splitting_cfg
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
if token_fusion_cfg is None:
token_fusion_cfg = []
else:
token_fusion_cfg = [token_fusion_cfg]
self.layers_cfg = [
embedding_cfg,
# input tensor splitting
*self._generate_tensor_splitting_cfg(),
*token_fusion_cfg,
# blocks
*self._generate_block_cfg(
dpr=dpr, block_cfg=block_cfg, depth=depth),
# norm
norm_cfg,
# head
head_cfg
]
def _fuse_tokens(self, x):
cls_token = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_token, x), dim=1)
return x
def _generate_block_cfg(self, dpr, depth, block_cfg):
blocks_cfg = []
for i in range(depth):
_cfg = block_cfg.copy()
_cfg['droppath_cfg']['drop_path'] = dpr[i]
blocks_cfg.append(_cfg)
return blocks_cfg
def _generate_tensor_splitting_cfg(self):
if self.tensor_splitting_cfg:
return [self.tensor_splitting_cfg]
else:
return []
def forward(self, x): # [512, 3, 32, 32]
for layer in self.layers:
if isinstance(x, tuple):
x = layer(*x)
else:
x = layer(x)
return x # [256, 5]
def init_weights(self):
# TODO: add init weights
pass

View File

@@ -1,415 +0,0 @@
import math
from typing import Callable
import torch
from colossalai import nn as col_nn
from colossalai.nn.layer.utils import CheckpointModule
from colossalai.registry import LAYERS, MODELS
from torch import dtype, nn
__all__ = [
'VisionTransformer',
'vit_lite_depth7_patch4_32',
'vit_tiny_patch4_32',
'vit_tiny_patch16_224',
'vit_tiny_patch16_384',
'vit_small_patch16_224',
'vit_small_patch16_384',
'vit_small_patch32_224',
'vit_small_patch32_384',
'vit_base_patch16_224',
'vit_base_patch16_384',
'vit_base_patch32_224',
'vit_base_patch32_384',
'vit_large_patch16_224',
'vit_large_patch16_384',
'vit_large_patch32_224',
'vit_large_patch32_384',
]
_init_rules = dict(
torch=dict(
embed=dict(
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
position_embed_initializer=col_nn.init.zeros_(),
),
transformer=dict(
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
),
head=dict(
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
),
),
jax=dict(
embed=dict(
weight_initializer=col_nn.init.lecun_normal_(),
bias_initializer=col_nn.init.zeros_(),
position_embed_initializer=col_nn.init.trunc_normal_(std=.02),
),
transformer=dict(
weight_initializer=col_nn.init.xavier_uniform_(),
bias_initializer=col_nn.init.normal_(std=1e-6),
),
head=dict(
weight_initializer=col_nn.init.zeros_(),
bias_initializer=col_nn.init.zeros_(),
),
),
)
@LAYERS.register_module
class ViTEmbedding(nn.Module):
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embedding_dim: int,
dropout: float,
dtype: dtype = None,
flatten: bool = True,
init_method: str = 'torch'):
super().__init__()
self.patch_embed = col_nn.PatchEmbedding(img_size,
patch_size,
in_chans,
embedding_dim,
dtype=dtype,
flatten=flatten,
**_init_rules[init_method]['embed'])
self.dropout = col_nn.Dropout(dropout)
def forward(self, x):
x = self.patch_embed(x)
x = self.dropout(x)
return x
@LAYERS.register_module
class ViTSelfAttention(nn.Module):
def __init__(self,
dim: int,
num_heads: int,
attention_dropout: float,
dropout: float,
bias: bool = True,
dtype: dtype = None,
init_method: str = 'torch'):
super().__init__()
self.attention_head_size = dim // num_heads
self.query_key_value = col_nn.Linear(dim,
3 * dim,
dtype=dtype,
bias=bias,
**_init_rules[init_method]['transformer'])
self.attention_dropout = col_nn.Dropout(attention_dropout)
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True, **_init_rules[init_method]['transformer'])
self.dropout = col_nn.Dropout(dropout)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x):
qkv = self.query_key_value(x)
all_head_size = qkv.shape[-1] // 3
num_attention_heads = all_head_size // self.attention_head_size
new_qkv_shape = qkv.shape[:-1] + \
(num_attention_heads, 3 * self.attention_head_size)
qkv = qkv.view(new_qkv_shape)
qkv = qkv.permute((0, 2, 1, 3))
q, k, v = torch.chunk(qkv, 3, dim=-1)
x = torch.matmul(q, k.transpose(-1, -2))
x = x / math.sqrt(self.attention_head_size)
x = self.softmax(x)
x = self.attention_dropout(x)
x = torch.matmul(x, v)
x = x.transpose(1, 2)
new_context_layer_shape = x.size()[:-2] + (all_head_size, )
x = x.reshape(new_context_layer_shape)
x = self.dense(x)
x = self.dropout(x)
return x
@LAYERS.register_module
class ViTMLP(nn.Module):
def __init__(self,
dim: int,
mlp_ratio: int,
activation: Callable,
dropout: float,
dtype: dtype = None,
bias: bool = True,
init_method: str = 'torch'):
super().__init__()
self.dense_1 = col_nn.Linear(dim,
mlp_ratio * dim,
dtype=dtype,
bias=bias,
**_init_rules[init_method]['transformer'])
self.activation = activation
self.dropout_1 = col_nn.Dropout(dropout)
self.dense_2 = col_nn.Linear(mlp_ratio * dim,
dim,
dtype=dtype,
bias=bias,
**_init_rules[init_method]['transformer'])
self.dropout_2 = col_nn.Dropout(dropout)
def forward(self, x):
x = self.dense_1(x)
x = self.activation(x)
x = self.dropout_1(x)
x = self.dense_2(x)
x = self.dropout_2(x)
return x
@LAYERS.register_module
class ViTHead(nn.Module):
def __init__(self,
dim: int,
num_classes: int,
representation_size: int = None,
dtype: dtype = None,
bias: bool = True,
init_method: str = 'torch'):
super().__init__()
if representation_size:
self.representation = col_nn.Linear(dim,
representation_size,
bias=bias,
dtype=dtype,
**_init_rules[init_method]['head'])
else:
self.representation = None
representation_size = dim
self.dense = col_nn.Classifier(representation_size,
num_classes,
dtype=dtype,
bias=bias,
**_init_rules[init_method]['head'])
def forward(self, x):
x = x[:, 0]
if self.representation is not None:
x = self.representation(x)
x = self.dense(x)
return x
@LAYERS.register_module
class ViTBlock(CheckpointModule):
def __init__(self,
dim: int,
num_heads: int,
mlp_ratio: int,
activation: Callable,
attention_dropout: float = 0.,
dropout: float = 0.,
drop_path: float = 0.,
layernorm_epsilon: float = 1e-6,
dtype: dtype = None,
bias: bool = True,
checkpoint: bool = False,
init_method: str = 'torch'):
super().__init__(checkpoint)
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.attn = ViTSelfAttention(dim=dim,
num_heads=num_heads,
attention_dropout=attention_dropout,
dropout=dropout,
bias=bias,
dtype=dtype,
init_method=init_method)
self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
self.mlp = ViTMLP(dim=dim,
mlp_ratio=mlp_ratio,
activation=activation,
dropout=dropout,
dtype=dtype,
bias=bias,
init_method=init_method)
def _forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
@MODELS.register_module
class VisionTransformer(nn.Module):
def __init__(self,
img_size: int = 224,
patch_size: int = 16,
in_chans: int = 3,
num_classes: int = 1000,
depth: int = 12,
num_heads: int = 12,
dim: int = 768,
mlp_ratio: int = 4,
attention_dropout: float = 0.,
dropout: float = 0.1,
drop_path: float = 0.,
layernorm_epsilon: float = 1e-6,
activation: Callable = nn.functional.gelu,
representation_size: int = None,
dtype: dtype = None,
bias: bool = True,
checkpoint: bool = False,
init_method: str = 'torch'):
super().__init__()
embed = ViTEmbedding(img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embedding_dim=dim,
dropout=dropout,
dtype=dtype,
init_method=init_method)
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
blocks = [
ViTBlock(
dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
attention_dropout=attention_dropout,
dropout=dropout,
drop_path=dpr[i],
activation=activation,
dtype=dtype,
bias=bias,
checkpoint=checkpoint,
init_method=init_method,
) for i in range(depth)
]
norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
head = ViTHead(dim=dim,
num_classes=num_classes,
representation_size=representation_size,
dtype=dtype,
bias=bias,
init_method=init_method)
self.layers = nn.Sequential(
embed,
*blocks,
norm,
head,
)
def forward(self, x):
x = self.layers(x)
return x
def _create_vit_model(**model_kwargs):
model = VisionTransformer(**model_kwargs)
return model
@MODELS.register_module
def vit_lite_depth7_patch4_32(**kwargs):
model_kwargs = dict(img_size=32, patch_size=4, dim=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_patch4_32(**kwargs):
model_kwargs = dict(img_size=32, patch_size=4, dim=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_patch16_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_patch16_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_patch32_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_patch16_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_patch32_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_patch16_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_patch32_224(**kwargs):
model_kwargs = dict(img_size=224, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
return _create_vit_model(**model_kwargs)
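# Hypothetical usage of the variants above before their removal (after this commit
# the tests import the same model names from the external titans package, as shown
# in the test changes below):
#
# model = vit_base_patch16_224(num_classes=1000, checkpoint=True, init_method='jax')
# logits = model(images)     # images: [batch, 3, 224, 224] -> logits: [batch, 1000]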

View File

@@ -1,3 +1,4 @@
pytest
torchvision
transformers
titans

View File

@@ -1,4 +1,5 @@
import os
from functools import partial
from pathlib import Path
@@ -6,19 +7,21 @@ import colossalai
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.amp.amp_type import AMP_TYPE
from colossalai.builder import build_pipeline_model
from colossalai.engine.schedule import PipelineSchedule
from colossalai.logging import get_dist_logger
from colossalai.nn import LinearWarmupLR
from colossalai.nn.loss import CrossEntropyLoss
from colossalai.amp import AMP_TYPE
from colossalai.trainer import Trainer, hooks
from colossalai.utils import free_port, get_dataloader
from colossalai.engine.gradient_accumulation import GradAccumLrSchedulerByStep
from colossalai.context import ParallelMode
from colossalai.testing import rerun_if_address_is_in_use
from model_zoo.vit import vit_tiny_patch4_32
from torchvision import transforms
from torchvision.datasets import CIFAR10
from colossalai.utils import free_port
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.nn import CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.utils import is_using_pp, get_dataloader
from colossalai.utils.model.pipelinable import PipelinableContext
from tqdm import tqdm
from titans.dataloader.cifar10 import build_cifar
from titans.model.vit import vit_tiny_patch4_32
BATCH_SIZE = 4
NUM_EPOCHS = 60
@@ -34,35 +37,35 @@ def run_trainer(rank, world_size, port):
logger = get_dist_logger()
model = vit_tiny_patch4_32()
pipe_model = build_pipeline_model(model.layers, num_chunks=1)
# get logger
logger = get_dist_logger()
# build dataloaders
transform_train = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
pipelinable = PipelinableContext()
with pipelinable:
model = vit_tiny_patch4_32()
pipelinable.to_layer_list()
pipelinable.load_policy("uniform")
model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
train_dataset = CIFAR10(root=Path(os.environ['DATA']), train=True, download=True, transform=transform_train)
train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
# create dataloaders
root = Path(os.environ['DATA'])
train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, pad_if_needed=True, crop=32, resize=32)
# build criterion
criterion = CrossEntropyLoss()
# create loss function
criterion = CrossEntropyLoss(label_smoothing=0.1)
# optimizer
optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0)
# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
# lr_scheduler
steps_per_epoch = GradAccumLrSchedulerByStep.compute_effective_steps_per_epoch(train_dataloader, accumulate_size=2)
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = steps_per_epoch * WARMUP_EPOCHS
lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps)
# create lr scheduler
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
engine, train_dataloader, _, lr_scheduler = colossalai.initialize(pipe_model,
optimizer,
criterion,
train_dataloader,
lr_scheduler=lr_scheduler)
# initialize
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader)
logger = get_dist_logger()