mirror of https://github.com/hpcaitech/ColossalAI
[titans]remove model zoo (#1042)
* [CLI] add CLI launcher
* Revert "[CLI] add CLI launcher"
This reverts commit df7e6506d4.
* rm model zoo
pull/1015/head^2
parent 0dac86866b
commit 9feff0f760
|
@ -1 +0,0 @@
|
|||
from .gpt import *
|
|
@ -1,478 +0,0 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.builder.pipeline import partition_uniform
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn.layer.utils import CheckpointModule, divide
|
||||
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
|
||||
from colossalai.registry import LAYERS, LOSSES, MODELS
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import dtype, nn
|
||||
|
||||
__all__ = [
|
||||
'GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt2_xl_pipeline',
|
||||
'gpt2_8B_pipeline', 'gpt3', 'gpt3_pipeline'
|
||||
]
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTEmbedding(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
embedding_dim: int,
|
||||
vocab_size: int,
|
||||
max_position_embeddings: int,
|
||||
num_tokentypes: int = 0,
|
||||
padding_idx: int = None,
|
||||
dropout: float = 0.,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype)
|
||||
self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype)
|
||||
if num_tokentypes > 0:
|
||||
self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype)
|
||||
else:
|
||||
self.tokentype_embeddings = None
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
@property
|
||||
def word_embedding_weight(self):
|
||||
return self.word_embeddings.weight
|
||||
|
||||
def forward(self, input_ids, position_ids=None, tokentype_ids=None):
|
||||
seq_length = input_ids.size(1)
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0)
|
||||
x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids)
|
||||
if self.tokentype_embeddings is not None and tokentype_ids is not None:
|
||||
x = x + self.tokentype_embeddings(tokentype_ids)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
attention_dropout: float,
|
||||
dropout: float,
|
||||
bias: bool = True,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.fuse_scale_mask_softmax = fuse_scale_mask_softmax
|
||||
self.attention_head_size = divide(dim, num_heads)
|
||||
self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias)
|
||||
if fuse_scale_mask_softmax:
|
||||
from colossalai.kernel import FusedScaleMaskSoftmax
|
||||
from colossalai.kernel.cuda_native.scaled_softmax import \
|
||||
AttnMaskType
|
||||
self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True,
|
||||
input_in_bf16=False,
|
||||
attn_mask_type=AttnMaskType.causal,
|
||||
scaled_masked_softmax_fusion=True,
|
||||
mask_func=None,
|
||||
softmax_in_fp32=True,
|
||||
scale=math.sqrt(self.attention_head_size))
|
||||
else:
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
self.attention_dropout = col_nn.Dropout(attention_dropout)
|
||||
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True)
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x, attention_mask=None):
|
||||
qkv = self.query_key_value(x)
|
||||
q, k, v = torch.chunk(qkv, 3, dim=-1)
|
||||
all_head_size = q.shape[-1]
|
||||
num_attention_heads = divide(all_head_size, self.attention_head_size)
|
||||
new_shape = q.shape[:-1] + \
|
||||
(num_attention_heads, self.attention_head_size)
|
||||
q = q.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
k = k.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
v = v.view(new_shape).permute((0, 2, 1, 3)).contiguous()
|
||||
|
||||
x = torch.matmul(q, k.transpose(-1, -2))
|
||||
|
||||
if self.fuse_scale_mask_softmax:
|
||||
x = self.softmax(x, attention_mask)
|
||||
else:
|
||||
x = x / math.sqrt(self.attention_head_size)
|
||||
# causal mask
|
||||
q_len, k_len = q.size(-2), k.size(-2)
|
||||
causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8,
|
||||
device=get_current_device())).view(1, 1, q_len, k_len).bool()
|
||||
x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device()))
|
||||
if attention_mask is not None:
|
||||
x = x + attention_mask
|
||||
x = self.softmax(x)
|
||||
|
||||
x = self.attention_dropout(x)
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_context_layer_shape = x.size()[:-2] + (all_head_size,)
|
||||
x = x.reshape(new_context_layer_shape)
|
||||
|
||||
x = self.dense(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
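The unfused branch above applies the causal mask by writing a large negative value into future positions before the softmax. As a point of reference only (not part of the original file), the same masking can be reproduced with plain PyTorch and toy shapes:

import math
import torch

# Toy shapes: (batch, heads, seq_len, head_size)
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
head_size = q.size(-1)

scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)

# Lower-triangular causal mask, broadcast over batch and heads
q_len, k_len = scores.size(-2), scores.size(-1)
causal_mask = torch.tril(torch.ones(q_len, k_len, dtype=torch.bool)).view(1, 1, q_len, k_len)
scores = torch.where(causal_mask, scores, torch.tensor(-1e4, dtype=scores.dtype))

probs = torch.softmax(scores, dim=-1)   # each position attends only to itself and earlier tokens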
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTMLP(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True):
|
||||
super().__init__()
|
||||
intermediate_dim = int(dim * mlp_ratio)
|
||||
self.dense_1 = col_nn.Linear(dim, intermediate_dim, dtype=dtype, bias=bias)
|
||||
self.activation = activation
|
||||
self.dense_2 = col_nn.Linear(intermediate_dim, dim, dtype=dtype, bias=bias)
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense_1(x)
|
||||
x = self.activation(x)
|
||||
x = self.dense_2(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTBlock(CheckpointModule):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
activation_offload: bool = False):
|
||||
super().__init__(checkpoint, activation_offload)
|
||||
self.apply_post_layernorm = apply_post_layernorm
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = GPTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
dtype=dtype)
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.mlp = GPTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, dropout=dropout, dtype=dtype, bias=bias)
|
||||
|
||||
def _forward(self, x, attention_mask=None):
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm1(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.attn(x, attention_mask)
|
||||
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm2(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.mlp(x)
|
||||
|
||||
return x, attention_mask
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class GPTLMHead(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
vocab_size: int,
|
||||
word_embedding_weight: nn.Parameter = None,
|
||||
bias: bool = False,
|
||||
dtype: dtype = None) -> None:
|
||||
super().__init__()
|
||||
self.dense = col_nn.Classifier(dim, vocab_size, word_embedding_weight, bias=bias, dtype=dtype)
|
||||
|
||||
@property
|
||||
def weight(self):
|
||||
return self.dense.weight
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense(x)
|
||||
return x
|
||||
|
||||
|
||||
@LOSSES.register_module
|
||||
class GPTLMLoss(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.loss = col_nn.CrossEntropyLoss()
|
||||
|
||||
def forward(self, logits, labels):
|
||||
shift_logits = logits[..., :-1, :].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
# Flatten the tokens
|
||||
return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
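GPTLMLoss implements standard next-token prediction: position t is scored against token t + 1. A minimal plain-PyTorch sketch of the same shift, using torch.nn.CrossEntropyLoss as a stand-in for the tensor-parallel col_nn.CrossEntropyLoss:

import torch
import torch.nn as nn

logits = torch.randn(2, 5, 10)          # (batch, seq_len, vocab_size)
labels = torch.randint(0, 10, (2, 5))   # (batch, seq_len)

# Drop the last logit and the first label so position t predicts token t + 1
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss = nn.CrossEntropyLoss()(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))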
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class GPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
activation_offload: bool = False) -> None:
|
||||
super().__init__()
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
self.blocks = nn.ModuleList([
|
||||
GPTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint,
|
||||
activation_offload=activation_offload
|
||||
) for _ in range(depth)
|
||||
])
|
||||
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
self.head = GPTLMHead(dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
word_embedding_weight=self.embed.word_embedding_weight,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None):
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
batch_size = input_ids.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class PipelineGPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False,
|
||||
first: bool = False,
|
||||
last: bool = False):
|
||||
super().__init__()
|
||||
self.checkpoint = checkpoint
|
||||
self.first = first
|
||||
self.last = last
|
||||
if first:
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
self.blocks = nn.ModuleList([
|
||||
GPTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint,
|
||||
) for _ in range(depth)
|
||||
])
|
||||
if self.last:
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.head = GPTLMHead(dim=dim, vocab_size=vocab_size, dtype=dtype)
|
||||
|
||||
def forward(self, x=None, input_ids=None, attention_mask=None):
|
||||
if self.first:
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
if self.first:
|
||||
batch_size = input_ids.shape[0]
|
||||
else:
|
||||
batch_size = x.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
if self.last:
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _create_gpt_model(**model_kwargs):
|
||||
model = GPT(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def _create_gpt_pipeline_model(depth=48, num_chunks=1, layer_partitions=None, **model_kwargs):
|
||||
logger = get_dist_logger()
|
||||
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
|
||||
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
||||
rank = gpc.get_global_rank()
|
||||
wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1])
|
||||
parts = partition_uniform(depth, pipeline_size,
|
||||
num_chunks)[pipeline_rank] if layer_partitions is None else layer_partitions
|
||||
models = []
|
||||
for start, end in parts:
|
||||
model_kwargs['first'] = start == 0
|
||||
model_kwargs['last'] = end == depth
|
||||
model_kwargs['depth'] = end - start
|
||||
chunk = PipelineGPT(**model_kwargs).to(get_current_device())
|
||||
if start == 0:
|
||||
wrapper.register_parameter(chunk.embed.word_embedding_weight)
|
||||
elif end == depth:
|
||||
wrapper.register_parameter(chunk.head.weight)
|
||||
models.append(chunk)
|
||||
logger.info(f'==> Rank {rank} built layer {start}-{end} / total {depth}')
|
||||
if len(models) == 1:
|
||||
model = models[0]
|
||||
else:
|
||||
model = nn.ModuleList(models)
|
||||
return model
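With num_chunks=1, uniform partitioning slices the depth into contiguous, equally sized ranges, one per pipeline rank; the first stage owns the embedding, the last stage owns the LM head, and the two share the word embedding weight through PipelineSharedModuleWrapper. The following is an illustration of the resulting ranges only, not the actual partition_uniform implementation:

depth, pipeline_size = 48, 4
per_stage = depth // pipeline_size
parts = [(rank * per_stage, (rank + 1) * per_stage) for rank in range(pipeline_size)]
# [(0, 12), (12, 24), (24, 36), (36, 48)]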
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_small(**kwargs):
|
||||
model_kwargs = dict(dim=768, depth=12, num_heads=12, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_medium(**kwargs):
|
||||
model_kwargs = dict(dim=1024, depth=24, num_heads=8, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_large(**kwargs):
|
||||
model_kwargs = dict(dim=1536, depth=36, num_heads=12, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_xl(**kwargs):
|
||||
model_kwargs = dict(dim=1600, depth=48, num_heads=16, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_8B(**kwargs):
|
||||
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_xl_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=1600, depth=48, num_heads=20, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt2_8B_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt3(**kwargs):
|
||||
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
||||
return _create_gpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def gpt3_pipeline(**kwargs):
|
||||
model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs)
|
||||
return _create_gpt_pipeline_model(**model_kwargs)
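As a rough illustration of how these configuration names map to parameter scale (ignoring embeddings, biases and LayerNorms, and assuming mlp_ratio=4, so each block holds about 12 * dim^2 parameters):

for name, dim, depth in [('gpt2_small', 768, 12), ('gpt2_xl', 1600, 48),
                         ('gpt2_8B', 3072, 72), ('gpt3', 12288, 96)]:
    print(name, f'{12 * depth * dim * dim / 1e9:.2f}B')
# gpt2_small 0.08B, gpt2_xl 1.47B, gpt2_8B 8.15B, gpt3 173.95B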
|
|
@ -1,26 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from colossalai.nn.layer import WrappedDropPath as DropPath
|
||||
|
||||
|
||||
class TransformerLayer(nn.Module):
|
||||
"""Transformer layer builder.
|
||||
"""
|
||||
def __init__(self,
|
||||
att: nn.Module,
|
||||
ffn: nn.Module,
|
||||
norm1: nn.Module,
|
||||
norm2: nn.Module,
|
||||
droppath=None,
|
||||
droppath_rate: float = 0):
|
||||
super().__init__()
|
||||
self.att = att
|
||||
self.ffn = ffn
|
||||
self.norm1 = norm1
|
||||
self.norm2 = norm2
|
||||
self.droppath = DropPath(droppath_rate) if droppath is None else droppath
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.droppath(self.att(self.norm1(x)))
|
||||
x = x + self.droppath(self.ffn(self.norm2(x)))
|
||||
return x
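TransformerLayer only wires pre-norm residual branches around whatever single-input attention and FFN modules it is given. A rough usage sketch with plain torch.nn stand-ins (assuming colossalai is installed so the module's imports resolve; the Linear layers below are placeholders, not real attention or FFN blocks):

import torch
import torch.nn as nn

d_model = 16
layer = TransformerLayer(att=nn.Linear(d_model, d_model),
                         ffn=nn.Linear(d_model, d_model),
                         norm1=nn.LayerNorm(d_model),
                         norm2=nn.LayerNorm(d_model),
                         droppath=nn.Identity())
out = layer(torch.randn(2, 4, d_model))   # output keeps the input shape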
|
|
@ -1,2 +0,0 @@
|
|||
from .models import Widenet, ViTMoE
|
||||
from .gpt import MOEGPT, prmoe_4b, prmoe_31b, prmoe_51b
|
|
@ -1,229 +0,0 @@
|
|||
from typing import Callable, List
|
||||
from torch import dtype, nn
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.registry import LAYERS, MODELS
|
||||
from colossalai.nn.layer import MoeModule
|
||||
from colossalai.context import MOE_CONTEXT
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn.layer.utils import CheckpointModule, divide
|
||||
from model_zoo.gpt.gpt import GPTEmbedding, GPTSelfAttention, GPTMLP, GPTBlock, GPTLMHead
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class MOEGPTBlock(CheckpointModule):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: float,
|
||||
activation: Callable,
|
||||
capacity_factor_train: float = 1.0,
|
||||
capacity_factor_eval: float = 1.0,
|
||||
use_residual: bool = False,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False):
|
||||
super().__init__(checkpoint)
|
||||
self.apply_post_layernorm = apply_post_layernorm
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = GPTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
dtype=dtype)
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
mlp_factory_dict = dict(dim=dim,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
bias=bias)
|
||||
|
||||
self.mlp = MoeModule(dim_model=dim,
|
||||
num_experts=num_experts,
|
||||
top_k=1,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_policy='Jitter',
|
||||
use_residual=use_residual,
|
||||
expert_cls=GPTMLP,
|
||||
**mlp_factory_dict)
|
||||
|
||||
def _forward(self, x, attention_mask=None):
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm1(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.attn(x, attention_mask)
|
||||
|
||||
if not self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = self.norm2(x)
|
||||
if self.apply_post_layernorm:
|
||||
residual = x
|
||||
x = residual + self.mlp(x)
|
||||
|
||||
return x, attention_mask
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class MOEGPT(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int or List[int],
|
||||
use_residual: bool = False,
|
||||
capacity_factor_train: float = 1.0,
|
||||
capacity_factor_eval: float = 1.0,
|
||||
vocab_size: int = 50304,
|
||||
max_position_embeddings: int = 1024,
|
||||
dim: int = 768,
|
||||
num_heads: int = 12,
|
||||
depth: int = 12,
|
||||
mlp_ratio: float = 4.0,
|
||||
dropout: float = 0.1,
|
||||
embedding_dropout: float = 0.1,
|
||||
attention_dropout: float = 0.1,
|
||||
layernorm_epsilon: float = 1e-5,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
apply_post_layernorm: bool = False,
|
||||
fuse_scale_mask_softmax: bool = False,
|
||||
checkpoint: bool = False) -> None:
|
||||
super().__init__()
|
||||
|
||||
half_depth = divide(depth, 2)
|
||||
if isinstance(num_experts, list):
|
||||
assert len(num_experts) == half_depth, \
|
||||
"The length of num_experts should equal to the number of MOE layers"
|
||||
num_experts_list = num_experts
|
||||
else:
|
||||
num_experts_list = [num_experts] * half_depth
|
||||
|
||||
self.embed = GPTEmbedding(embedding_dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
padding_idx=padding_idx,
|
||||
dropout=embedding_dropout,
|
||||
dtype=dtype)
|
||||
|
||||
block_list = []
|
||||
block_factory_dict = dict(dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
layernorm_epsilon=layernorm_epsilon,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
apply_post_layernorm=apply_post_layernorm,
|
||||
fuse_scale_mask_softmax=fuse_scale_mask_softmax,
|
||||
checkpoint=checkpoint)
|
||||
|
||||
for i in range(depth):
|
||||
|
||||
if i % 2 == 0:
|
||||
block_module = GPTBlock(**block_factory_dict)
|
||||
else:
|
||||
num_experts = num_experts_list[i // 2]
|
||||
block_module = MOEGPTBlock(num_experts=num_experts,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
use_residual=use_residual,
|
||||
**block_factory_dict)
|
||||
|
||||
block_list.append(block_module)
|
||||
|
||||
self.blocks = nn.ModuleList(block_list)
|
||||
|
||||
self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
self.head = GPTLMHead(dim=dim,
|
||||
vocab_size=vocab_size,
|
||||
word_embedding_weight=self.embed.word_embedding_weight,
|
||||
dtype=dtype)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.embed(input_ids)
|
||||
|
||||
# We create a 3D attention mask from a 2D tensor mask.
|
||||
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||
# Adapted from huggingface
|
||||
if attention_mask is not None:
|
||||
batch_size = input_ids.shape[0]
|
||||
attention_mask = attention_mask.view(batch_size, -1)
|
||||
attention_mask = col_nn.partition_batch(attention_mask)
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
for block in self.blocks:
|
||||
x, attention_mask = block(x, attention_mask)
|
||||
|
||||
x = self.head(self.norm(x))
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def _create_moegpt_model(**model_kwargs):
|
||||
model = MOEGPT(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
def _prmoe_check_sanity(kwargs_dict):
|
||||
logger = get_dist_logger()
|
||||
if not kwargs_dict.pop('use_residual', False):
|
||||
logger.warning(
|
||||
"If you want to use PR-MOE, please set 'use_residual' to True. "
|
||||
"Otherwise, we'll force 'use_residual' to True.",
|
||||
ranks=[0])
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_4b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64],
|
||||
use_residual=True,
|
||||
dim=1024,
|
||||
depth=24,
|
||||
num_heads=16,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_31b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128],
|
||||
use_residual=True,
|
||||
dim=2048,
|
||||
depth=24,
|
||||
num_heads=16,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def prmoe_51b(**kwargs):
|
||||
_prmoe_check_sanity(kwargs)
|
||||
model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64],
|
||||
use_residual=True,
|
||||
dim=3072,
|
||||
depth=32,
|
||||
num_heads=24,
|
||||
**kwargs)
|
||||
return _create_moegpt_model(**model_kwargs)
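MOEGPT alternates dense and MoE blocks: even-indexed layers are plain GPTBlocks, odd-indexed layers are MOEGPTBlocks, and num_experts_list is indexed by i // 2. A small illustration of the layer layout that prmoe_4b (depth 24) produces:

depth = 24
num_experts_list = [32] * 10 + [64] * 2    # one entry per MoE layer, as in prmoe_4b
layout = ['dense' if i % 2 == 0 else f'moe-{num_experts_list[i // 2]}' for i in range(depth)]
# ['dense', 'moe-32', 'dense', 'moe-32', ..., 'dense', 'moe-64']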
|
|
@ -1,226 +0,0 @@
|
|||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \
|
||||
WrappedDropout as Dropout, WrappedDropPath as DropPath
|
||||
from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator, MoeModule
|
||||
from .util import moe_sa_args, moe_mlp_args
|
||||
from ..helper import TransformerLayer
|
||||
from colossalai.context.moe_context import MOE_CONTEXT
|
||||
from colossalai.utils import get_current_device
|
||||
from typing import List
|
||||
|
||||
|
||||
class VanillaSelfAttention(nn.Module):
|
||||
"""Standard ViT self attention.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
d_model: int,
|
||||
n_heads: int,
|
||||
d_kv: int,
|
||||
attention_drop: float = 0,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True,
|
||||
dropout1=None,
|
||||
dropout2=None):
|
||||
super().__init__()
|
||||
self.n_heads = n_heads
|
||||
self.d_kv = d_kv
|
||||
self.scale = 1.0 / math.sqrt(self.d_kv)
|
||||
|
||||
self.dense1 = nn.Linear(d_model, 3 * n_heads * d_kv, bias, device=get_current_device())
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1
|
||||
self.dense2 = nn.Linear(n_heads * d_kv, d_model, device=get_current_device())
|
||||
self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2
|
||||
|
||||
def forward(self, x):
|
||||
qkv = self.dense1(x)
|
||||
new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv)
|
||||
qkv = qkv.view(*new_shape)
|
||||
qkv = qkv.permute(2, 0, 3, 1, 4)
|
||||
q, k, v = qkv[:]
|
||||
|
||||
x = torch.matmul(q, k.transpose(-2, -1)) * self.scale
|
||||
x = self.atten_drop(self.softmax(x))
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_shape = x.shape[:2] + (self.n_heads * self.d_kv,)
|
||||
x = x.reshape(*new_shape)
|
||||
x = self.dense2(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class VanillaFFN(nn.Module):
|
||||
"""FFN composed with two linear layers, also called MLP.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
d_model: int,
|
||||
d_ff: int,
|
||||
activation=None,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True,
|
||||
dropout1=None,
|
||||
dropout2=None):
|
||||
super().__init__()
|
||||
dense1 = nn.Linear(d_model, d_ff, bias, device=get_current_device())
|
||||
act = nn.GELU() if activation is None else activation
|
||||
dense2 = nn.Linear(d_ff, d_model, bias, device=get_current_device())
|
||||
drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1
|
||||
drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2
|
||||
|
||||
self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2)
|
||||
|
||||
def forward(self, x):
|
||||
return self.ffn(x)
|
||||
|
||||
|
||||
class Widenet(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int,
|
||||
capacity_factor_train: float = 1.25,
|
||||
capacity_factor_eval: float = 2.0,
|
||||
drop_tks: bool = True,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
d_model: int = 768,
|
||||
num_heads: int = 12,
|
||||
d_kv: int = 64,
|
||||
d_ff: int = 4096,
|
||||
attention_drop: float = 0.,
|
||||
drop_rate: float = 0.1,
|
||||
drop_path: float = 0.):
|
||||
super().__init__()
|
||||
|
||||
embedding = VanillaPatchEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embed_size=d_model)
|
||||
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
|
||||
|
||||
shared_sa = VanillaSelfAttention(**moe_sa_args(
|
||||
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
|
||||
|
||||
noisy_func = NormalNoiseGenerator(num_experts)
|
||||
shared_router = Top2Router(capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_func=noisy_func,
|
||||
drop_tks=drop_tks)
|
||||
shared_experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = [
|
||||
TransformerLayer(att=shared_sa,
|
||||
ffn=MoeLayer(dim_model=d_model,
|
||||
num_experts=num_experts,
|
||||
router=shared_router,
|
||||
experts=shared_experts),
|
||||
norm1=nn.LayerNorm(d_model, eps=1e-6),
|
||||
norm2=nn.LayerNorm(d_model, eps=1e-6),
|
||||
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth)
|
||||
]
|
||||
norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
|
||||
nn.init.zeros_(self.linear.weight)
|
||||
nn.init.zeros_(self.linear.bias)
|
||||
self.widenet = nn.Sequential(embedding, embed_dropout, *blocks, norm)
|
||||
|
||||
def forward(self, x):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.widenet(x)
|
||||
x = torch.mean(x, dim=1)
|
||||
x = self.linear(x)
|
||||
return x
|
||||
|
||||
|
||||
class ViTMoE(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_experts: int or List[int],
|
||||
use_residual: bool = False,
|
||||
capacity_factor_train: float = 1.25,
|
||||
capacity_factor_eval: float = 2.0,
|
||||
drop_tks: bool = True,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
d_model: int = 768,
|
||||
num_heads: int = 12,
|
||||
d_kv: int = 64,
|
||||
d_ff: int = 3072,
|
||||
attention_drop: float = 0.,
|
||||
drop_rate: float = 0.1,
|
||||
drop_path: float = 0.):
|
||||
super().__init__()
|
||||
|
||||
assert depth % 2 == 0, "The number of layers should be even right now"
|
||||
|
||||
if isinstance(num_experts, list):
|
||||
assert len(num_experts) == depth // 2, \
|
||||
"The length of num_experts should equal to the number of MOE layers"
|
||||
num_experts_list = num_experts
|
||||
else:
|
||||
num_experts_list = [num_experts] * (depth // 2)
|
||||
|
||||
embedding = VanillaPatchEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embed_size=d_model)
|
||||
embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = []
|
||||
for i in range(depth):
|
||||
sa = VanillaSelfAttention(**moe_sa_args(
|
||||
d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate))
|
||||
|
||||
if i % 2 == 0:
|
||||
ffn = VanillaFFN(**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
|
||||
else:
|
||||
num_experts = num_experts_list[i // 2]
|
||||
experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate)
|
||||
ffn = MoeModule(dim_model=d_model,
|
||||
num_experts=num_experts,
|
||||
top_k=1 if use_residual else 2,
|
||||
capacity_factor_train=capacity_factor_train,
|
||||
capacity_factor_eval=capacity_factor_eval,
|
||||
noisy_policy='Jitter' if use_residual else 'Gaussian',
|
||||
drop_tks=drop_tks,
|
||||
use_residual=use_residual,
|
||||
expert_instance=experts,
|
||||
expert_cls=VanillaFFN,
|
||||
**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate))
|
||||
|
||||
layer = TransformerLayer(att=sa,
|
||||
ffn=ffn,
|
||||
norm1=nn.LayerNorm(d_model, eps=1e-6),
|
||||
norm2=nn.LayerNorm(d_model, eps=1e-6),
|
||||
droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR))
|
||||
blocks.append(layer)
|
||||
|
||||
norm = nn.LayerNorm(d_model, eps=1e-6)
|
||||
self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes)
|
||||
nn.init.zeros_(self.linear.weight)
|
||||
nn.init.zeros_(self.linear.bias)
|
||||
self.vitmoe = nn.Sequential(embedding, embed_dropout, *blocks, norm)
|
||||
|
||||
def forward(self, x):
|
||||
MOE_CONTEXT.reset_loss()
|
||||
x = self.vitmoe(x)
|
||||
x = torch.mean(x, dim=1)
|
||||
x = self.linear(x)
|
||||
return x
|
|
@ -1,41 +0,0 @@
|
|||
from colossalai.context import ParallelMode
|
||||
from colossalai.nn.layer import WrappedDropout as Dropout
|
||||
|
||||
|
||||
def moe_sa_args(d_model: int,
|
||||
n_heads: int,
|
||||
d_kv: int,
|
||||
attention_drop: float = 0,
|
||||
drop_rate: float = 0,
|
||||
bias: bool = True):
|
||||
"""This is an example for args in moe self attention, since lots of modules should be
|
||||
adapted before putting them in experts.
|
||||
"""
|
||||
dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR)
|
||||
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
return dict(
|
||||
d_model=d_model,
|
||||
n_heads=n_heads,
|
||||
d_kv=d_kv,
|
||||
bias=bias,
|
||||
dropout1=dropout1,
|
||||
dropout2=dropout2
|
||||
)
|
||||
|
||||
|
||||
def moe_mlp_args(d_model: int,
|
||||
d_ff: int,
|
||||
drop_rate: float,
|
||||
bias: bool = True):
|
||||
"""This is an example for args of MLP in Experts, since lots of modules should be adapted
|
||||
before putting them in experts.
|
||||
"""
|
||||
dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
|
||||
return dict(
|
||||
d_model=d_model,
|
||||
d_ff=d_ff,
|
||||
bias=bias,
|
||||
dropout1=dropout1,
|
||||
dropout2=dropout2
|
||||
)
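models.py consumes these helpers by unpacking the returned dicts into the expert constructors. A sketch of that call pattern (assuming an initialized ColossalAI parallel context; the import paths below are assumed from the package layout, not confirmed by this diff):

from model_zoo.moe.models import VanillaFFN, VanillaSelfAttention   # path assumed
from model_zoo.moe.util import moe_mlp_args, moe_sa_args            # path assumed

sa = VanillaSelfAttention(**moe_sa_args(d_model=768, n_heads=12, d_kv=64,
                                        attention_drop=0., drop_rate=0.1))
ffn = VanillaFFN(**moe_mlp_args(d_model=768, d_ff=3072, drop_rate=0.1))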
|
|
@ -1 +0,0 @@
|
|||
from .vit import *
|
|
@ -1,87 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import torch
|
||||
|
||||
from colossalai.registry import MODELS
|
||||
from colossalai.nn.model.model_from_config import ModelFromConfig
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class VisionTransformerFromConfig(ModelFromConfig):
|
||||
"""Vision Transformer from
|
||||
`"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/pdf/2010.11929>`_.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
embedding_cfg: dict,
|
||||
norm_cfg: dict,
|
||||
block_cfg: dict,
|
||||
head_cfg: dict,
|
||||
token_fusion_cfg: dict = None,
|
||||
embed_dim=768,
|
||||
depth=12,
|
||||
drop_path_rate=0.,
|
||||
tensor_splitting_cfg: dict = None):
|
||||
super().__init__()
|
||||
self.embed_dim = embed_dim
|
||||
self.num_tokens = 1
|
||||
self.tensor_splitting_cfg = tensor_splitting_cfg
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)
|
||||
] # stochastic depth decay rule
|
||||
if token_fusion_cfg is None:
|
||||
token_fusion_cfg = []
|
||||
else:
|
||||
token_fusion_cfg = [token_fusion_cfg]
|
||||
|
||||
self.layers_cfg = [
|
||||
embedding_cfg,
|
||||
|
||||
# input tensor splitting
|
||||
*self._generate_tensor_splitting_cfg(),
|
||||
*token_fusion_cfg,
|
||||
|
||||
# blocks
|
||||
*self._generate_block_cfg(
|
||||
dpr=dpr, block_cfg=block_cfg, depth=depth),
|
||||
|
||||
# norm
|
||||
norm_cfg,
|
||||
|
||||
# head
|
||||
head_cfg
|
||||
]
|
||||
|
||||
def _fuse_tokens(self, x):
|
||||
cls_token = self.cls_token.expand(x.shape[0], -1, -1)
|
||||
x = torch.cat((cls_token, x), dim=1)
|
||||
return x
|
||||
|
||||
def _generate_block_cfg(self, dpr, depth, block_cfg):
|
||||
blocks_cfg = []
|
||||
|
||||
for i in range(depth):
|
||||
_cfg = block_cfg.copy()
|
||||
_cfg['droppath_cfg']['drop_path'] = dpr[i]
|
||||
blocks_cfg.append(_cfg)
|
||||
|
||||
return blocks_cfg
|
||||
|
||||
def _generate_tensor_splitting_cfg(self):
|
||||
if self.tensor_splitting_cfg:
|
||||
return [self.tensor_splitting_cfg]
|
||||
else:
|
||||
return []
|
||||
|
||||
def forward(self, x): # [512, 3, 32, 32]
|
||||
for layer in self.layers:
|
||||
if isinstance(x, tuple):
|
||||
x = layer(*x)
|
||||
else:
|
||||
x = layer(x)
|
||||
return x # [256, 5]
|
||||
|
||||
def init_weights(self):
|
||||
# TODO: add init weights
|
||||
pass
|
|
@ -1,415 +0,0 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
from colossalai import nn as col_nn
|
||||
from colossalai.nn.layer.utils import CheckpointModule
|
||||
from colossalai.registry import LAYERS, MODELS
|
||||
from torch import dtype, nn
|
||||
|
||||
__all__ = [
|
||||
'VisionTransformer',
|
||||
'vit_lite_depth7_patch4_32',
|
||||
'vit_tiny_patch4_32',
|
||||
'vit_tiny_patch16_224',
|
||||
'vit_tiny_patch16_384',
|
||||
'vit_small_patch16_224',
|
||||
'vit_small_patch16_384',
|
||||
'vit_small_patch32_224',
|
||||
'vit_small_patch32_384',
|
||||
'vit_base_patch16_224',
|
||||
'vit_base_patch16_384',
|
||||
'vit_base_patch32_224',
|
||||
'vit_base_patch32_384',
|
||||
'vit_large_patch16_224',
|
||||
'vit_large_patch16_384',
|
||||
'vit_large_patch32_224',
|
||||
'vit_large_patch32_384',
|
||||
]
|
||||
|
||||
_init_rules = dict(
|
||||
torch=dict(
|
||||
embed=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer=col_nn.init.zeros_(),
|
||||
),
|
||||
transformer=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
),
|
||||
head=dict(
|
||||
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1),
|
||||
),
|
||||
),
|
||||
jax=dict(
|
||||
embed=dict(
|
||||
weight_initializer=col_nn.init.lecun_normal_(),
|
||||
bias_initializer=col_nn.init.zeros_(),
|
||||
position_embed_initializer=col_nn.init.trunc_normal_(std=.02),
|
||||
),
|
||||
transformer=dict(
|
||||
weight_initializer=col_nn.init.xavier_uniform_(),
|
||||
bias_initializer=col_nn.init.normal_(std=1e-6),
|
||||
),
|
||||
head=dict(
|
||||
weight_initializer=col_nn.init.zeros_(),
|
||||
bias_initializer=col_nn.init.zeros_(),
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTEmbedding(nn.Module):
|
||||
def __init__(self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embedding_dim: int,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.patch_embed = col_nn.PatchEmbedding(img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embedding_dim,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
**_init_rules[init_method]['embed'])
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.patch_embed(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTSelfAttention(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
attention_dropout: float,
|
||||
dropout: float,
|
||||
bias: bool = True,
|
||||
dtype: dtype = None,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.attention_head_size = dim // num_heads
|
||||
self.query_key_value = col_nn.Linear(dim,
|
||||
3 * dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.attention_dropout = col_nn.Dropout(attention_dropout)
|
||||
self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True, **_init_rules[init_method]['transformer'])
|
||||
self.dropout = col_nn.Dropout(dropout)
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
|
||||
def forward(self, x):
|
||||
qkv = self.query_key_value(x)
|
||||
all_head_size = qkv.shape[-1] // 3
|
||||
num_attention_heads = all_head_size // self.attention_head_size
|
||||
new_qkv_shape = qkv.shape[:-1] + \
|
||||
(num_attention_heads, 3 * self.attention_head_size)
|
||||
qkv = qkv.view(new_qkv_shape)
|
||||
qkv = qkv.permute((0, 2, 1, 3))
|
||||
q, k, v = torch.chunk(qkv, 3, dim=-1)
|
||||
|
||||
x = torch.matmul(q, k.transpose(-1, -2))
|
||||
x = x / math.sqrt(self.attention_head_size)
|
||||
x = self.softmax(x)
|
||||
x = self.attention_dropout(x)
|
||||
|
||||
x = torch.matmul(x, v)
|
||||
x = x.transpose(1, 2)
|
||||
new_context_layer_shape = x.size()[:-2] + (all_head_size, )
|
||||
x = x.reshape(new_context_layer_shape)
|
||||
|
||||
x = self.dense(x)
|
||||
x = self.dropout(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTMLP(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
mlp_ratio: int,
|
||||
activation: Callable,
|
||||
dropout: float,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
self.dense_1 = col_nn.Linear(dim,
|
||||
mlp_ratio * dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.activation = activation
|
||||
self.dropout_1 = col_nn.Dropout(dropout)
|
||||
self.dense_2 = col_nn.Linear(mlp_ratio * dim,
|
||||
dim,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['transformer'])
|
||||
self.dropout_2 = col_nn.Dropout(dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.dense_1(x)
|
||||
x = self.activation(x)
|
||||
x = self.dropout_1(x)
|
||||
x = self.dense_2(x)
|
||||
x = self.dropout_2(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTHead(nn.Module):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_classes: int,
|
||||
representation_size: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
if representation_size:
|
||||
self.representation = col_nn.Linear(dim,
|
||||
representation_size,
|
||||
bias=bias,
|
||||
dtype=dtype,
|
||||
**_init_rules[init_method]['head'])
|
||||
else:
|
||||
self.representation = None
|
||||
representation_size = dim
|
||||
|
||||
self.dense = col_nn.Classifier(representation_size,
|
||||
num_classes,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
**_init_rules[init_method]['head'])
|
||||
|
||||
def forward(self, x):
|
||||
x = x[:, 0]
|
||||
if self.representation is not None:
|
||||
x = self.representation(x)
|
||||
x = self.dense(x)
|
||||
return x
|
||||
|
||||
|
||||
@LAYERS.register_module
|
||||
class ViTBlock(CheckpointModule):
|
||||
def __init__(self,
|
||||
dim: int,
|
||||
num_heads: int,
|
||||
mlp_ratio: int,
|
||||
activation: Callable,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.,
|
||||
drop_path: float = 0.,
|
||||
layernorm_epsilon: float = 1e-6,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
checkpoint: bool = False,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__(checkpoint)
|
||||
self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.attn = ViTSelfAttention(dim=dim,
|
||||
num_heads=num_heads,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
bias=bias,
|
||||
dtype=dtype,
|
||||
init_method=init_method)
|
||||
self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity()
|
||||
self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
self.mlp = ViTMLP(dim=dim,
|
||||
mlp_ratio=mlp_ratio,
|
||||
activation=activation,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
init_method=init_method)
|
||||
|
||||
def _forward(self, x):
|
||||
x = x + self.drop_path(self.attn(self.norm1(x)))
|
||||
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
||||
return x
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
class VisionTransformer(nn.Module):
|
||||
def __init__(self,
|
||||
img_size: int = 224,
|
||||
patch_size: int = 16,
|
||||
in_chans: int = 3,
|
||||
num_classes: int = 1000,
|
||||
depth: int = 12,
|
||||
num_heads: int = 12,
|
||||
dim: int = 768,
|
||||
mlp_ratio: int = 4,
|
||||
attention_dropout: float = 0.,
|
||||
dropout: float = 0.1,
|
||||
drop_path: float = 0.,
|
||||
layernorm_epsilon: float = 1e-6,
|
||||
activation: Callable = nn.functional.gelu,
|
||||
representation_size: int = None,
|
||||
dtype: dtype = None,
|
||||
bias: bool = True,
|
||||
checkpoint: bool = False,
|
||||
init_method: str = 'torch'):
|
||||
super().__init__()
|
||||
|
||||
embed = ViTEmbedding(img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embedding_dim=dim,
|
||||
dropout=dropout,
|
||||
dtype=dtype,
|
||||
init_method=init_method)
|
||||
|
||||
# stochastic depth decay rule
|
||||
dpr = [x.item() for x in torch.linspace(0, drop_path, depth)]
|
||||
blocks = [
|
||||
ViTBlock(
|
||||
dim=dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
drop_path=dpr[i],
|
||||
activation=activation,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
checkpoint=checkpoint,
|
||||
init_method=init_method,
|
||||
) for i in range(depth)
|
||||
]
|
||||
|
||||
norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype)
|
||||
|
||||
head = ViTHead(dim=dim,
|
||||
num_classes=num_classes,
|
||||
representation_size=representation_size,
|
||||
dtype=dtype,
|
||||
bias=bias,
|
||||
init_method=init_method)
|
||||
|
||||
self.layers = nn.Sequential(
|
||||
embed,
|
||||
*blocks,
|
||||
norm,
|
||||
head,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.layers(x)
|
||||
return x
|
||||
|
||||
|
||||
def _create_vit_model(**model_kwargs):
|
||||
model = VisionTransformer(**model_kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_lite_depth7_patch4_32(**kwargs):
|
||||
model_kwargs = dict(img_size=32, patch_size=4, dim=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch4_32(**kwargs):
|
||||
model_kwargs = dict(img_size=32, patch_size=4, dim=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_tiny_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_small_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_base_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch16_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch16_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch32_224(**kwargs):
|
||||
model_kwargs = dict(img_size=224, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
|
||||
|
||||
|
||||
@MODELS.register_module
|
||||
def vit_large_patch32_384(**kwargs):
|
||||
model_kwargs = dict(img_size=384, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs)
|
||||
return _create_vit_model(**model_kwargs)
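For a rough sense of scale, the token sequence each variant feeds into the transformer has (img_size / patch_size)^2 patch tokens plus one class token (ViTHead reads x[:, 0], which assumes the patch embedding prepends a class token):

for name, img_size, patch_size in [('vit_lite_depth7_patch4_32', 32, 4),
                                   ('vit_base_patch16_224', 224, 16),
                                   ('vit_large_patch32_384', 384, 32)]:
    num_patches = (img_size // patch_size) ** 2
    print(name, num_patches + 1)   # 65, 197, 145 tokens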
|
|
@ -1,3 +1,4 @@
|
|||
pytest
|
||||
torchvision
|
||||
transformers
|
||||
titans
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -6,19 +7,21 @@ import colossalai
|
|||
import pytest
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
from colossalai.amp.amp_type import AMP_TYPE
|
||||
from colossalai.builder import build_pipeline_model
|
||||
from colossalai.engine.schedule import PipelineSchedule
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import LinearWarmupLR
|
||||
from colossalai.nn.loss import CrossEntropyLoss
|
||||
from colossalai.amp import AMP_TYPE
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import free_port, get_dataloader
|
||||
from colossalai.engine.gradient_accumulation import GradAccumLrSchedulerByStep
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.testing import rerun_if_address_is_in_use
|
||||
from model_zoo.vit import vit_tiny_patch4_32
|
||||
from torchvision import transforms
|
||||
from torchvision.datasets import CIFAR10
|
||||
from colossalai.utils import free_port
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.utils import is_using_pp, get_dataloader
|
||||
from colossalai.utils.model.pipelinable import PipelinableContext
|
||||
from tqdm import tqdm
|
||||
|
||||
from titans.dataloader.cifar10 import build_cifar
|
||||
from titans.model.vit import vit_tiny_patch4_32
|
||||
|
||||
BATCH_SIZE = 4
|
||||
NUM_EPOCHS = 60
|
||||
|
@ -34,35 +37,35 @@ def run_trainer(rank, world_size, port):
|
|||
|
||||
logger = get_dist_logger()
|
||||
|
||||
model = vit_tiny_patch4_32()
|
||||
pipe_model = build_pipeline_model(model.layers, num_chunks=1)
|
||||
# get logger
|
||||
logger = get_dist_logger()
|
||||
|
||||
# build dataloaders
|
||||
transform_train = transforms.Compose([
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
|
||||
])
|
||||
pipelinable = PipelinableContext()
|
||||
with pipelinable:
|
||||
model = vit_tiny_patch4_32()
|
||||
pipelinable.to_layer_list()
|
||||
pipelinable.load_policy("uniform")
|
||||
model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
|
||||
|
||||
train_dataset = CIFAR10(root=Path(os.environ['DATA']), train=True, download=True, transform=transform_train)
|
||||
train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
|
||||
# create dataloaders
|
||||
root = Path(os.environ['DATA'])
|
||||
train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, pad_if_needed=True, crop=32, resize=32)
|
||||
|
||||
# build criterion
|
||||
criterion = CrossEntropyLoss()
|
||||
# create loss function
|
||||
criterion = CrossEntropyLoss(label_smoothing=0.1)
|
||||
|
||||
# optimizer
|
||||
optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0)
|
||||
# create optimizer
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
|
||||
|
||||
# lr_scheduler
|
||||
steps_per_epoch = GradAccumLrSchedulerByStep.compute_effective_steps_per_epoch(train_dataloader, accumulate_size=2)
|
||||
total_steps = steps_per_epoch * NUM_EPOCHS
|
||||
warmup_steps = steps_per_epoch * WARMUP_EPOCHS
|
||||
lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps)
|
||||
# create lr scheduler
|
||||
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
|
||||
|
||||
engine, train_dataloader, _, lr_scheduler = colossalai.initialize(pipe_model,
|
||||
optimizer,
|
||||
criterion,
|
||||
train_dataloader,
|
||||
lr_scheduler=lr_scheduler)
|
||||
# initialize
|
||||
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
|
||||
optimizer=optimizer,
|
||||
criterion=criterion,
|
||||
train_dataloader=train_dataloader,
|
||||
test_dataloader=test_dataloader)
|
||||
|
||||
logger = get_dist_logger()