From 9feff0f7603fcfc6f67150f37f051a24c9fada02 Mon Sep 17 00:00:00 2001 From: YuliangLiu0306 <72588413+YuliangLiu0306@users.noreply.github.com> Date: Tue, 31 May 2022 10:40:47 +0800 Subject: [PATCH] [titans]remove model zoo (#1042) * [CLI] add CLI launcher * Revert "[CLI] add CLI launcher" This reverts commit df7e6506d4500af6a9220ef7fe4d3c7b1daebd4c. * rm model zoo --- model_zoo/__init__.py | 0 model_zoo/gpt/__init__.py | 1 - model_zoo/gpt/gpt.py | 478 ------------------ model_zoo/helper.py | 26 - model_zoo/moe/__init__.py | 2 - model_zoo/moe/gpt.py | 229 --------- model_zoo/moe/models.py | 226 --------- model_zoo/moe/util.py | 41 -- model_zoo/vit/__init__.py | 1 - .../vit/vision_transformer_from_config.py | 87 ---- model_zoo/vit/vit.py | 415 --------------- requirements/requirements-test.txt | 1 + .../test_cifar_with_data_pipeline_tensor.py | 71 +-- 13 files changed, 38 insertions(+), 1540 deletions(-) delete mode 100644 model_zoo/__init__.py delete mode 100644 model_zoo/gpt/__init__.py delete mode 100644 model_zoo/gpt/gpt.py delete mode 100644 model_zoo/helper.py delete mode 100644 model_zoo/moe/__init__.py delete mode 100644 model_zoo/moe/gpt.py delete mode 100644 model_zoo/moe/models.py delete mode 100644 model_zoo/moe/util.py delete mode 100644 model_zoo/vit/__init__.py delete mode 100644 model_zoo/vit/vision_transformer_from_config.py delete mode 100644 model_zoo/vit/vit.py diff --git a/model_zoo/__init__.py b/model_zoo/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/model_zoo/gpt/__init__.py b/model_zoo/gpt/__init__.py deleted file mode 100644 index 5a20f0f81..000000000 --- a/model_zoo/gpt/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .gpt import * \ No newline at end of file diff --git a/model_zoo/gpt/gpt.py b/model_zoo/gpt/gpt.py deleted file mode 100644 index f684316b6..000000000 --- a/model_zoo/gpt/gpt.py +++ /dev/null @@ -1,478 +0,0 @@ -import math -from typing import Callable - -import torch -from colossalai import nn as col_nn -from colossalai.builder.pipeline import partition_uniform -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.nn.layer.utils import CheckpointModule, divide -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper -from colossalai.registry import LAYERS, LOSSES, MODELS -from colossalai.utils import get_current_device -from torch import dtype, nn - -__all__ = [ - 'GPT', 'GPTLMLoss', 'gpt2_small', 'gpt2_medium', 'gpt2_large', 'gpt2_xl', 'gpt2_8B', 'gpt2_xl_pipeline', - 'gpt2_8B_pipeline', 'gpt3', 'gpt3_pipeline' -] - - -@LAYERS.register_module -class GPTEmbedding(nn.Module): - - def __init__(self, - embedding_dim: int, - vocab_size: int, - max_position_embeddings: int, - num_tokentypes: int = 0, - padding_idx: int = None, - dropout: float = 0., - dtype: dtype = None) -> None: - super().__init__() - self.word_embeddings = col_nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx, dtype=dtype) - self.position_embeddings = col_nn.Embedding(max_position_embeddings, embedding_dim, dtype=dtype) - if num_tokentypes > 0: - self.tokentype_embeddings = col_nn.Embedding(num_tokentypes, embedding_dim, dtype=dtype) - else: - self.tokentype_embeddings = None - self.dropout = col_nn.Dropout(dropout) - - @property - def word_embedding_weight(self): - return self.word_embeddings.weight - - def forward(self, input_ids, position_ids=None, tokentype_ids=None): - seq_length = input_ids.size(1) - if position_ids is 
None: - position_ids = torch.arange(seq_length, dtype=torch.long, device=get_current_device()).unsqueeze(0) - x = self.word_embeddings(input_ids) + self.position_embeddings(position_ids) - if self.tokentype_embeddings is not None and tokentype_ids is not None: - x = x + self.tokentype_embeddings(tokentype_ids) - x = self.dropout(x) - - return x - - -@LAYERS.register_module -class GPTSelfAttention(nn.Module): - - def __init__(self, - dim: int, - num_heads: int, - attention_dropout: float, - dropout: float, - bias: bool = True, - fuse_scale_mask_softmax: bool = False, - dtype: dtype = None) -> None: - super().__init__() - self.fuse_scale_mask_softmax = fuse_scale_mask_softmax - self.attention_head_size = divide(dim, num_heads) - self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias) - if fuse_scale_mask_softmax: - from colossalai.kernel import FusedScaleMaskSoftmax - from colossalai.kernel.cuda_native.scaled_softmax import \ - AttnMaskType - self.softmax = FusedScaleMaskSoftmax(input_in_fp16=True, - input_in_bf16=False, - attn_mask_type=AttnMaskType.causal, - scaled_masked_softmax_fusion=True, - mask_func=None, - softmax_in_fp32=True, - scale=math.sqrt(self.attention_head_size)) - else: - self.softmax = nn.Softmax(dim=-1) - self.attention_dropout = col_nn.Dropout(attention_dropout) - self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True) - self.dropout = col_nn.Dropout(dropout) - - def forward(self, x, attention_mask=None): - qkv = self.query_key_value(x) - q, k, v = torch.chunk(qkv, 3, dim=-1) - all_head_size = q.shape[-1] - num_attention_heads = divide(all_head_size, self.attention_head_size) - new_shape = q.shape[:-1] + \ - (num_attention_heads, self.attention_head_size) - q = q.view(new_shape).permute((0, 2, 1, 3)).contiguous() - k = k.view(new_shape).permute((0, 2, 1, 3)).contiguous() - v = v.view(new_shape).permute((0, 2, 1, 3)).contiguous() - - x = torch.matmul(q, k.transpose(-1, -2)) - - if self.fuse_scale_mask_softmax: - x = self.softmax(x, attention_mask) - else: - x = x / math.sqrt(self.attention_head_size) - # causal mask - q_len, k_len = q.size(-2), k.size(-2) - causal_mask = torch.tril(torch.ones((q_len, k_len), dtype=torch.uint8, - device=get_current_device())).view(1, 1, q_len, k_len).bool() - x = torch.where(causal_mask, x, torch.tensor(-1e4, dtype=x.dtype, device=get_current_device())) - if attention_mask is not None: - x = x + attention_mask - x = self.softmax(x) - - x = self.attention_dropout(x) - - x = torch.matmul(x, v) - x = x.transpose(1, 2) - new_context_layer_shape = x.size()[:-2] + (all_head_size,) - x = x.reshape(new_context_layer_shape) - - x = self.dense(x) - x = self.dropout(x) - - return x - - -@LAYERS.register_module -class GPTMLP(nn.Module): - - def __init__(self, - dim: int, - mlp_ratio: float, - activation: Callable, - dropout: float, - dtype: dtype = None, - bias: bool = True): - super().__init__() - intermediate_dim = int(dim * mlp_ratio) - self.dense_1 = col_nn.Linear(dim, intermediate_dim, dtype=dtype, bias=bias) - self.activation = activation - self.dense_2 = col_nn.Linear(intermediate_dim, dim, dtype=dtype, bias=bias) - self.dropout = col_nn.Dropout(dropout) - - def forward(self, x): - x = self.dense_1(x) - x = self.activation(x) - x = self.dense_2(x) - x = self.dropout(x) - return x - - -@LAYERS.register_module -class GPTBlock(CheckpointModule): - - def __init__(self, - dim: int, - num_heads: int, - mlp_ratio: float, - activation: Callable, - attention_dropout: float = 0., - dropout: float = 0., - layernorm_epsilon: float = 
1e-5, - dtype: dtype = None, - bias: bool = True, - apply_post_layernorm: bool = False, - fuse_scale_mask_softmax: bool = False, - checkpoint: bool = False, - activation_offload: bool = False): - super().__init__(checkpoint, activation_offload) - self.apply_post_layernorm = apply_post_layernorm - self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.attn = GPTSelfAttention(dim=dim, - num_heads=num_heads, - attention_dropout=attention_dropout, - dropout=dropout, - bias=bias, - fuse_scale_mask_softmax=fuse_scale_mask_softmax, - dtype=dtype) - self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.mlp = GPTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, dropout=dropout, dtype=dtype, bias=bias) - - def _forward(self, x, attention_mask=None): - if not self.apply_post_layernorm: - residual = x - x = self.norm1(x) - if self.apply_post_layernorm: - residual = x - x = residual + self.attn(x, attention_mask) - - if not self.apply_post_layernorm: - residual = x - x = self.norm2(x) - if self.apply_post_layernorm: - residual = x - x = residual + self.mlp(x) - - return x, attention_mask - - -@LAYERS.register_module -class GPTLMHead(nn.Module): - - def __init__(self, - dim: int, - vocab_size: int, - word_embeeding_weight: nn.Parameter = None, - bias: bool = False, - dtype: dtype = None) -> None: - super().__init__() - self.dense = col_nn.Classifier(dim, vocab_size, word_embeeding_weight, bias=bias, dtype=dtype) - - @property - def weight(self): - return self.dense.weight - - def forward(self, x): - x = self.dense(x) - return x - - -@LOSSES.register_module -class GPTLMLoss(nn.Module): - - def __init__(self): - super().__init__() - self.loss = col_nn.CrossEntropyLoss() - - def forward(self, logits, labels): - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - -@MODELS.register_module -class GPT(nn.Module): - - def __init__(self, - vocab_size: int = 50304, - max_position_embeddings: int = 1024, - dim: int = 768, - num_heads: int = 12, - depth: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, - embedding_dropout: float = 0.1, - attention_dropout: float = 0.1, - layernorm_epsilon: float = 1e-5, - activation: Callable = nn.functional.gelu, - padding_idx: int = None, - dtype: dtype = None, - bias: bool = True, - apply_post_layernorm: bool = False, - fuse_scale_mask_softmax: bool = False, - checkpoint: bool = False, - activation_offload: bool = False) -> None: - super().__init__() - self.embed = GPTEmbedding(embedding_dim=dim, - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - padding_idx=padding_idx, - dropout=embedding_dropout, - dtype=dtype) - self.blocks = nn.ModuleList([ - GPTBlock( - dim=dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - activation=activation, - attention_dropout=attention_dropout, - dropout=dropout, - layernorm_epsilon=layernorm_epsilon, - dtype=dtype, - bias=bias, - apply_post_layernorm=apply_post_layernorm, - fuse_scale_mask_softmax=fuse_scale_mask_softmax, - checkpoint=checkpoint, - activation_offload=activation_offload - ) for _ in range(depth) - ]) - - self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - - self.head = GPTLMHead(dim=dim, - vocab_size=vocab_size, - word_embeeding_weight=self.embed.word_embedding_weight, - dtype=dtype) - - def forward(self, input_ids, 
attention_mask=None): - x = self.embed(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # Adapted from huggingface - if attention_mask is not None: - batch_size = input_ids.shape[0] - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = col_nn.partition_batch(attention_mask) - attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * -10000.0 - - for block in self.blocks: - x, attention_mask = block(x, attention_mask) - - x = self.head(self.norm(x)) - - return x - - -class PipelineGPT(nn.Module): - - def __init__(self, - vocab_size: int = 50304, - max_position_embeddings: int = 1024, - dim: int = 768, - num_heads: int = 12, - depth: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, - embedding_dropout: float = 0.1, - attention_dropout: float = 0.1, - layernorm_epsilon: float = 1e-5, - activation: Callable = nn.functional.gelu, - padding_idx: int = None, - dtype: dtype = None, - bias: bool = True, - apply_post_layernorm: bool = False, - fuse_scale_mask_softmax: bool = False, - checkpoint: bool = False, - first: bool = False, - last: bool = False): - super().__init__() - self.checkpoint = checkpoint - self.first = first - self.last = last - if first: - self.embed = GPTEmbedding(embedding_dim=dim, - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - padding_idx=padding_idx, - dropout=embedding_dropout, - dtype=dtype) - self.blocks = nn.ModuleList([ - GPTBlock( - dim=dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - activation=activation, - attention_dropout=attention_dropout, - dropout=dropout, - layernorm_epsilon=layernorm_epsilon, - dtype=dtype, - bias=bias, - apply_post_layernorm=apply_post_layernorm, - fuse_scale_mask_softmax=fuse_scale_mask_softmax, - checkpoint=checkpoint, - ) for _ in range(depth) - ]) - if self.last: - self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.head = GPTLMHead(dim=dim, vocab_size=vocab_size, dtype=dtype) - - def forward(self, x=None, input_ids=None, attention_mask=None): - if self.first: - x = self.embed(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. 
- # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # Adapted from huggingface - if attention_mask is not None: - if self.first: - batch_size = input_ids.shape[0] - else: - batch_size = x.shape[0] - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = col_nn.partition_batch(attention_mask) - attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * -10000.0 - - for block in self.blocks: - x, attention_mask = block(x, attention_mask) - - if self.last: - x = self.head(self.norm(x)) - - return x - - -def _create_gpt_model(**model_kwargs): - model = GPT(**model_kwargs) - return model - - -def _create_gpt_pipeline_model(depth=48, num_chunks=1, layer_partitions=None, **model_kwargs): - logger = get_dist_logger() - pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) - pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) - rank = gpc.get_global_rank() - wrapper = PipelineSharedModuleWrapper([0, pipeline_size - 1]) - parts = partition_uniform(depth, pipeline_size, - num_chunks)[pipeline_rank] if layer_partitions is None else layer_partitions - models = [] - for start, end in parts: - model_kwargs['first'] = start == 0 - model_kwargs['last'] = end == depth - model_kwargs['depth'] = end - start - chunk = PipelineGPT(**model_kwargs).to(get_current_device()) - if start == 0: - wrapper.register_parameter(chunk.embed.word_embedding_weight) - elif end == depth: - wrapper.register_parameter(chunk.head.weight) - models.append(chunk) - logger.info(f'==> Rank {rank} built layer {start}-{end} / total {depth}') - if len(models) == 1: - model = models[0] - else: - model = nn.ModuleList(models) - return model - - -@MODELS.register_module -def gpt2_small(**kwargs): - model_kwargs = dict(dim=768, depth=12, num_heads=12, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_medium(**kwargs): - model_kwargs = dict(dim=1024, depth=24, num_heads=8, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_large(**kwargs): - model_kwargs = dict(dim=1536, depth=36, num_heads=12, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_xl(**kwargs): - model_kwargs = dict(dim=1600, depth=48, num_heads=16, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_8B(**kwargs): - model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_xl_pipeline(**kwargs): - model_kwargs = dict(dim=1600, depth=48, num_heads=20, **kwargs) - return _create_gpt_pipeline_model(**model_kwargs) - - -@MODELS.register_module -def gpt2_8B_pipeline(**kwargs): - model_kwargs = dict(dim=3072, depth=72, num_heads=24, **kwargs) - return _create_gpt_pipeline_model(**model_kwargs) - - -@MODELS.register_module -def gpt3(**kwargs): - model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs) - return _create_gpt_model(**model_kwargs) - - -@MODELS.register_module -def gpt3_pipeline(**kwargs): - model_kwargs = dict(dim=12288, depth=96, num_heads=96, **kwargs) - return _create_gpt_pipeline_model(**model_kwargs) diff --git a/model_zoo/helper.py b/model_zoo/helper.py deleted file mode 100644 index 0f4fac17c..000000000 --- a/model_zoo/helper.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch -import torch.nn as 
nn -from colossalai.nn.layer import WrappedDropPath as DropPath - - -class TransformerLayer(nn.Module): - """Transformer layer builder. - """ - def __init__(self, - att: nn.Module, - ffn: nn.Module, - norm1: nn.Module, - norm2: nn.Module, - droppath=None, - droppath_rate: float = 0): - super().__init__() - self.att = att - self.ffn = ffn - self.norm1 = norm1 - self.norm2 = norm2 - self.droppath = DropPath(droppath_rate) if droppath is None else droppath - - def forward(self, x): - x = x + self.droppath(self.att(self.norm1(x))) - x = x + self.droppath(self.ffn(self.norm2(x))) - return x diff --git a/model_zoo/moe/__init__.py b/model_zoo/moe/__init__.py deleted file mode 100644 index e3d055463..000000000 --- a/model_zoo/moe/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .models import Widenet, ViTMoE -from .gpt import MOEGPT, prmoe_4b, prmoe_31b, prmoe_51b diff --git a/model_zoo/moe/gpt.py b/model_zoo/moe/gpt.py deleted file mode 100644 index 35c71505b..000000000 --- a/model_zoo/moe/gpt.py +++ /dev/null @@ -1,229 +0,0 @@ -from typing import Callable, List -from torch import dtype, nn -from colossalai import nn as col_nn -from colossalai.registry import LAYERS, MODELS -from colossalai.nn.layer import MoeModule -from colossalai.context import MOE_CONTEXT -from colossalai.logging import get_dist_logger -from colossalai.nn.layer.utils import CheckpointModule, divide -from model_zoo.gpt.gpt import GPTEmbedding, GPTSelfAttention, GPTMLP, GPTBlock, GPTLMHead - - -@LAYERS.register_module -class MOEGPTBlock(CheckpointModule): - - def __init__(self, - num_experts: int, - dim: int, - num_heads: int, - mlp_ratio: float, - activation: Callable, - capacity_factor_train: float = 1.0, - capacity_factor_eval: float = 1.0, - use_residual: bool = False, - attention_dropout: float = 0., - dropout: float = 0., - layernorm_epsilon: float = 1e-5, - dtype: dtype = None, - bias: bool = True, - apply_post_layernorm: bool = False, - fuse_scale_mask_softmax: bool = False, - checkpoint: bool = False): - super().__init__(checkpoint) - self.apply_post_layernorm = apply_post_layernorm - self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.attn = GPTSelfAttention(dim=dim, - num_heads=num_heads, - attention_dropout=attention_dropout, - dropout=dropout, - bias=bias, - fuse_scale_mask_softmax=fuse_scale_mask_softmax, - dtype=dtype) - self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - - mpl_factory_dict = dict(dim=dim, - mlp_ratio=mlp_ratio, - activation=activation, - dropout=dropout, - dtype=dtype, - bias=bias) - - self.mlp = MoeModule(dim_model=dim, - num_experts=num_experts, - top_k=1, - capacity_factor_train=capacity_factor_train, - capacity_factor_eval=capacity_factor_eval, - noisy_policy='Jitter', - use_residual=use_residual, - expert_cls=GPTMLP, - **mpl_factory_dict) - - def _forward(self, x, attention_mask=None): - if not self.apply_post_layernorm: - residual = x - x = self.norm1(x) - if self.apply_post_layernorm: - residual = x - x = residual + self.attn(x, attention_mask) - - if not self.apply_post_layernorm: - residual = x - x = self.norm2(x) - if self.apply_post_layernorm: - residual = x - x = residual + self.mlp(x) - - return x, attention_mask - - -@MODELS.register_module -class MOEGPT(nn.Module): - - def __init__(self, - num_experts: int or List[int], - use_residual: bool = False, - capacity_factor_train: float = 1.0, - capacity_factor_eval: float = 1.0, - vocab_size: int = 50304, - max_position_embeddings: int = 1024, - dim: int = 
768, - num_heads: int = 12, - depth: int = 12, - mlp_ratio: float = 4.0, - dropout: float = 0.1, - embedding_dropout: float = 0.1, - attention_dropout: float = 0.1, - layernorm_epsilon: float = 1e-5, - activation: Callable = nn.functional.gelu, - padding_idx: int = None, - dtype: dtype = None, - bias: bool = True, - apply_post_layernorm: bool = False, - fuse_scale_mask_softmax: bool = False, - checkpoint: bool = False) -> None: - super().__init__() - - half_depth = divide(depth, 2) - if isinstance(num_experts, list): - assert len(num_experts) == half_depth, \ - "The length of num_experts should equal to the number of MOE layers" - num_experts_list = num_experts - else: - num_experts_list = [num_experts] * half_depth - - self.embed = GPTEmbedding(embedding_dim=dim, - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - padding_idx=padding_idx, - dropout=embedding_dropout, - dtype=dtype) - - block_list = [] - block_factory_dict = dict(dim=dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - activation=activation, - attention_dropout=attention_dropout, - dropout=dropout, - layernorm_epsilon=layernorm_epsilon, - dtype=dtype, - bias=bias, - apply_post_layernorm=apply_post_layernorm, - fuse_scale_mask_softmax=fuse_scale_mask_softmax, - checkpoint=checkpoint) - - for i in range(depth): - - if i % 2 == 0: - block_module = GPTBlock(**block_factory_dict) - else: - num_experts = num_experts_list[i // 2] - block_module = MOEGPTBlock(num_experts=num_experts, - capacity_factor_train=capacity_factor_train, - capacity_factor_eval=capacity_factor_eval, - use_residual=use_residual, - **block_factory_dict) - - block_list.append(block_module) - - self.blocks = nn.ModuleList(block_list) - - self.norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - - self.head = GPTLMHead(dim=dim, - vocab_size=vocab_size, - word_embeeding_weight=self.embed.word_embedding_weight, - dtype=dtype) - - def forward(self, input_ids, attention_mask=None): - MOE_CONTEXT.reset_loss() - x = self.embed(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # Adapted from huggingface - if attention_mask is not None: - batch_size = input_ids.shape[0] - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = col_nn.partition_batch(attention_mask) - attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = attention_mask.to(dtype=x.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * -10000.0 - - for block in self.blocks: - x, attention_mask = block(x, attention_mask) - - x = self.head(self.norm(x)) - - return x - - -def _create_moegpt_model(**model_kwargs): - model = MOEGPT(**model_kwargs) - return model - - -def _prmoe_check_sanity(kwargs_dict): - logger = get_dist_logger() - if not kwargs_dict.pop('use_residual', False): - logger.warning( - "If you want to use PR-MOE, please set 'use_residual' to True. 
" - "Otherwise, we'll force 'use_residual' to True.", - ranks=[0]) - - -@MODELS.register_module -def prmoe_4b(**kwargs): - _prmoe_check_sanity(kwargs) - model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64], - use_residual=True, - dim=1024, - depth=24, - num_heads=16, - **kwargs) - return _create_moegpt_model(**model_kwargs) - - -@MODELS.register_module -def prmoe_31b(**kwargs): - _prmoe_check_sanity(kwargs) - model_kwargs = dict(num_experts=[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 128, 128], - use_residual=True, - dim=2048, - depth=24, - num_heads=16, - **kwargs) - return _create_moegpt_model(**model_kwargs) - - -@MODELS.register_module -def prmoe_51b(**kwargs): - _prmoe_check_sanity(kwargs) - model_kwargs = dict(num_experts=[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64], - use_residual=True, - dim=3072, - depth=32, - num_heads=24, - **kwargs) - return _create_moegpt_model(**model_kwargs) diff --git a/model_zoo/moe/models.py b/model_zoo/moe/models.py deleted file mode 100644 index 9dc273d08..000000000 --- a/model_zoo/moe/models.py +++ /dev/null @@ -1,226 +0,0 @@ -import math -import torch -import torch.nn as nn -from colossalai.context import ParallelMode -from colossalai.nn.layer import VanillaPatchEmbedding, VanillaClassifier, \ - WrappedDropout as Dropout, WrappedDropPath as DropPath -from colossalai.nn.layer.moe import build_ffn_experts, MoeLayer, Top2Router, NormalNoiseGenerator, MoeModule -from .util import moe_sa_args, moe_mlp_args -from ..helper import TransformerLayer -from colossalai.context.moe_context import MOE_CONTEXT -from colossalai.utils import get_current_device -from typing import List - - -class VanillaSelfAttention(nn.Module): - """Standard ViT self attention. - """ - - def __init__(self, - d_model: int, - n_heads: int, - d_kv: int, - attention_drop: float = 0, - drop_rate: float = 0, - bias: bool = True, - dropout1=None, - dropout2=None): - super().__init__() - self.n_heads = n_heads - self.d_kv = d_kv - self.scale = 1.0 / math.sqrt(self.d_kv) - - self.dense1 = nn.Linear(d_model, 3 * n_heads * d_kv, bias, device=get_current_device()) - self.softmax = nn.Softmax(dim=-1) - self.atten_drop = nn.Dropout(attention_drop) if dropout1 is None else dropout1 - self.dense2 = nn.Linear(n_heads * d_kv, d_model, device=get_current_device()) - self.dropout = nn.Dropout(drop_rate) if dropout2 is None else dropout2 - - def forward(self, x): - qkv = self.dense1(x) - new_shape = qkv.shape[:2] + (3, self.n_heads, self.d_kv) - qkv = qkv.view(*new_shape) - qkv = qkv.permute(2, 0, 3, 1, 4) - q, k, v = qkv[:] - - x = torch.matmul(q, k.transpose(-2, -1)) * self.scale - x = self.atten_drop(self.softmax(x)) - - x = torch.matmul(x, v) - x = x.transpose(1, 2) - new_shape = x.shape[:2] + (self.n_heads * self.d_kv,) - x = x.reshape(*new_shape) - x = self.dense2(x) - x = self.dropout(x) - - return x - - -class VanillaFFN(nn.Module): - """FFN composed with two linear layers, also called MLP. 
- """ - - def __init__(self, - d_model: int, - d_ff: int, - activation=None, - drop_rate: float = 0, - bias: bool = True, - dropout1=None, - dropout2=None): - super().__init__() - dense1 = nn.Linear(d_model, d_ff, bias, device=get_current_device()) - act = nn.GELU() if activation is None else activation - dense2 = nn.Linear(d_ff, d_model, bias, device=get_current_device()) - drop1 = nn.Dropout(drop_rate) if dropout1 is None else dropout1 - drop2 = nn.Dropout(drop_rate) if dropout2 is None else dropout2 - - self.ffn = nn.Sequential(dense1, act, drop1, dense2, drop2) - - def forward(self, x): - return self.ffn(x) - - -class Widenet(nn.Module): - - def __init__(self, - num_experts: int, - capacity_factor_train: float = 1.25, - capacity_factor_eval: float = 2.0, - drop_tks: bool = True, - img_size: int = 224, - patch_size: int = 16, - in_chans: int = 3, - num_classes: int = 1000, - depth: int = 12, - d_model: int = 768, - num_heads: int = 12, - d_kv: int = 64, - d_ff: int = 4096, - attention_drop: float = 0., - drop_rate: float = 0.1, - drop_path: float = 0.): - super().__init__() - - embedding = VanillaPatchEmbedding(img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_size=d_model) - embed_dropout = Dropout(p=drop_rate, mode=ParallelMode.TENSOR) - - shared_sa = VanillaSelfAttention(**moe_sa_args( - d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate)) - - noisy_func = NormalNoiseGenerator(num_experts) - shared_router = Top2Router(capacity_factor_train=capacity_factor_train, - capacity_factor_eval=capacity_factor_eval, - noisy_func=noisy_func, - drop_tks=drop_tks) - shared_experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate) - - # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] - blocks = [ - TransformerLayer(att=shared_sa, - ffn=MoeLayer(dim_model=d_model, - num_experts=num_experts, - router=shared_router, - experts=shared_experts), - norm1=nn.LayerNorm(d_model, eps=1e-6), - norm2=nn.LayerNorm(d_model, eps=1e-6), - droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) for i in range(depth) - ] - norm = nn.LayerNorm(d_model, eps=1e-6) - self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes) - nn.init.zeros_(self.linear.weight) - nn.init.zeros_(self.linear.bias) - self.widenet = nn.Sequential(embedding, embed_dropout, *blocks, norm) - - def forward(self, x): - MOE_CONTEXT.reset_loss() - x = self.widenet(x) - x = torch.mean(x, dim=1) - x = self.linear(x) - return x - - -class ViTMoE(nn.Module): - - def __init__(self, - num_experts: int or List[int], - use_residual: bool = False, - capacity_factor_train: float = 1.25, - capacity_factor_eval: float = 2.0, - drop_tks: bool = True, - img_size: int = 224, - patch_size: int = 16, - in_chans: int = 3, - num_classes: int = 1000, - depth: int = 12, - d_model: int = 768, - num_heads: int = 12, - d_kv: int = 64, - d_ff: int = 3072, - attention_drop: float = 0., - drop_rate: float = 0.1, - drop_path: float = 0.): - super().__init__() - - assert depth % 2 == 0, "The number of layers should be even right now" - - if isinstance(num_experts, list): - assert len(num_experts) == depth // 2, \ - "The length of num_experts should equal to the number of MOE layers" - num_experts_list = num_experts - else: - num_experts_list = [num_experts] * (depth // 2) - - embedding = VanillaPatchEmbedding(img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_size=d_model) - embed_dropout = 
Dropout(p=drop_rate, mode=ParallelMode.TENSOR) - - # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] - blocks = [] - for i in range(depth): - sa = VanillaSelfAttention(**moe_sa_args( - d_model=d_model, n_heads=num_heads, d_kv=d_kv, attention_drop=attention_drop, drop_rate=drop_rate)) - - if i % 2 == 0: - ffn = VanillaFFN(**moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate)) - else: - num_experts = num_experts_list[i // 2] - experts = build_ffn_experts(num_experts, d_model, d_ff, drop_rate=drop_rate) - ffn = MoeModule(dim_model=d_model, - num_experts=num_experts, - top_k=1 if use_residual else 2, - capacity_factor_train=capacity_factor_train, - capacity_factor_eval=capacity_factor_eval, - noisy_policy='Jitter' if use_residual else 'Gaussian', - drop_tks=drop_tks, - use_residual=use_residual, - expert_instance=experts, - expert_cls=VanillaFFN, - **moe_mlp_args(d_model=d_model, d_ff=d_ff, drop_rate=drop_rate)) - - layer = TransformerLayer(att=sa, - ffn=ffn, - norm1=nn.LayerNorm(d_model, eps=1e-6), - norm2=nn.LayerNorm(d_model, eps=1e-6), - droppath=DropPath(p=dpr[i], mode=ParallelMode.TENSOR)) - blocks.append(layer) - - norm = nn.LayerNorm(d_model, eps=1e-6) - self.linear = VanillaClassifier(in_features=d_model, num_classes=num_classes) - nn.init.zeros_(self.linear.weight) - nn.init.zeros_(self.linear.bias) - self.vitmoe = nn.Sequential(embedding, embed_dropout, *blocks, norm) - - def forward(self, x): - MOE_CONTEXT.reset_loss() - x = self.vitmoe(x) - x = torch.mean(x, dim=1) - x = self.linear(x) - return x diff --git a/model_zoo/moe/util.py b/model_zoo/moe/util.py deleted file mode 100644 index 60028656e..000000000 --- a/model_zoo/moe/util.py +++ /dev/null @@ -1,41 +0,0 @@ -from colossalai.context import ParallelMode -from colossalai.nn.layer import WrappedDropout as Dropout - - -def moe_sa_args(d_model: int, - n_heads: int, - d_kv: int, - attention_drop: float = 0, - drop_rate: float = 0, - bias: bool = True): - """This is an example for args in moe self attention, since lots of modules should be - adapted before putting them in experts. - """ - dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR) - dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) - return dict( - d_model=d_model, - n_heads=n_heads, - d_kv=d_kv, - bias=bias, - dropout1=dropout1, - dropout2=dropout2 - ) - - -def moe_mlp_args(d_model: int, - d_ff: int, - drop_rate: float, - bias: bool = True): - """This is an example for args of MLP in Experts, since lots of modules should be adapted - before putting them in experts. 
- """ - dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR) - dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR) - return dict( - d_model=d_model, - d_ff=d_ff, - bias=bias, - dropout1=dropout1, - dropout2=dropout2 - ) diff --git a/model_zoo/vit/__init__.py b/model_zoo/vit/__init__.py deleted file mode 100644 index 5e5f1941d..000000000 --- a/model_zoo/vit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .vit import * \ No newline at end of file diff --git a/model_zoo/vit/vision_transformer_from_config.py b/model_zoo/vit/vision_transformer_from_config.py deleted file mode 100644 index af1e32091..000000000 --- a/model_zoo/vit/vision_transformer_from_config.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch - -from colossalai.registry import MODELS -from colossalai.nn.model.model_from_config import ModelFromConfig - - -@MODELS.register_module -class VisionTransformerFromConfig(ModelFromConfig): - """Vision Transformer from - `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" `_. - - """ - - def __init__(self, - embedding_cfg: dict, - norm_cfg: dict, - block_cfg: dict, - head_cfg: dict, - token_fusion_cfg: dict = None, - embed_dim=768, - depth=12, - drop_path_rate=0., - tensor_splitting_cfg: dict = None): - super().__init__() - self.embed_dim = embed_dim - self.num_tokens = 1 - self.tensor_splitting_cfg = tensor_splitting_cfg - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - if token_fusion_cfg is None: - token_fusion_cfg = [] - else: - token_fusion_cfg = [token_fusion_cfg] - - self.layers_cfg = [ - embedding_cfg, - - # input tensor splitting - *self._generate_tensor_splitting_cfg(), - *token_fusion_cfg, - - # blocks - *self._generate_block_cfg( - dpr=dpr, block_cfg=block_cfg, depth=depth), - - # norm - norm_cfg, - - # head - head_cfg - ] - - def _fuse_tokens(self, x): - cls_token = self.cls_token.expand(x.shape[0], -1, -1) - x = torch.cat((cls_token, x), dim=1) - return x - - def _generate_block_cfg(self, dpr, depth, block_cfg): - blocks_cfg = [] - - for i in range(depth): - _cfg = block_cfg.copy() - _cfg['droppath_cfg']['drop_path'] = dpr[i] - blocks_cfg.append(_cfg) - - return blocks_cfg - - def _generate_tensor_splitting_cfg(self): - if self.tensor_splitting_cfg: - return [self.tensor_splitting_cfg] - else: - return [] - - def forward(self, x): # [512, 3, 32, 32] - for layer in self.layers: - if isinstance(x, tuple): - x = layer(*x) - else: - x = layer(x) - return x # [256, 5] - - def init_weights(self): - # TODO: add init weights - pass diff --git a/model_zoo/vit/vit.py b/model_zoo/vit/vit.py deleted file mode 100644 index 9bdcbfd38..000000000 --- a/model_zoo/vit/vit.py +++ /dev/null @@ -1,415 +0,0 @@ -import math -from typing import Callable - -import torch -from colossalai import nn as col_nn -from colossalai.nn.layer.utils import CheckpointModule -from colossalai.registry import LAYERS, MODELS -from torch import dtype, nn - -__all__ = [ - 'VisionTransformer', - 'vit_lite_depth7_patch4_32', - 'vit_tiny_patch4_32', - 'vit_tiny_patch16_224', - 'vit_tiny_patch16_384', - 'vit_small_patch16_224', - 'vit_small_patch16_384', - 'vit_small_patch32_224', - 'vit_small_patch32_384', - 'vit_base_patch16_224', - 'vit_base_patch16_384', - 'vit_base_patch32_224', - 'vit_base_patch32_384', - 'vit_large_patch16_224', - 'vit_large_patch16_384', - 'vit_large_patch32_224', - 'vit_large_patch32_384', -] - -_init_rules = dict( - torch=dict( - embed=dict( - 
weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), - position_embed_initializer=col_nn.init.zeros_(), - ), - transformer=dict( - weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), - ), - head=dict( - weight_initializer=col_nn.init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer=col_nn.init.xavier_uniform_(a=1, scale=1), - ), - ), - jax=dict( - embed=dict( - weight_initializer=col_nn.init.lecun_normal_(), - bias_initializer=col_nn.init.zeros_(), - position_embed_initializer=col_nn.init.trunc_normal_(std=.02), - ), - transformer=dict( - weight_initializer=col_nn.init.xavier_uniform_(), - bias_initializer=col_nn.init.normal_(std=1e-6), - ), - head=dict( - weight_initializer=col_nn.init.zeros_(), - bias_initializer=col_nn.init.zeros_(), - ), - ), -) - - -@LAYERS.register_module -class ViTEmbedding(nn.Module): - def __init__(self, - img_size: int, - patch_size: int, - in_chans: int, - embedding_dim: int, - dropout: float, - dtype: dtype = None, - flatten: bool = True, - init_method: str = 'torch'): - super().__init__() - self.patch_embed = col_nn.PatchEmbedding(img_size, - patch_size, - in_chans, - embedding_dim, - dtype=dtype, - flatten=flatten, - **_init_rules[init_method]['embed']) - self.dropout = col_nn.Dropout(dropout) - - def forward(self, x): - x = self.patch_embed(x) - x = self.dropout(x) - return x - - -@LAYERS.register_module -class ViTSelfAttention(nn.Module): - def __init__(self, - dim: int, - num_heads: int, - attention_dropout: float, - dropout: float, - bias: bool = True, - dtype: dtype = None, - init_method: str = 'torch'): - super().__init__() - self.attention_head_size = dim // num_heads - self.query_key_value = col_nn.Linear(dim, - 3 * dim, - dtype=dtype, - bias=bias, - **_init_rules[init_method]['transformer']) - self.attention_dropout = col_nn.Dropout(attention_dropout) - self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True, **_init_rules[init_method]['transformer']) - self.dropout = col_nn.Dropout(dropout) - self.softmax = nn.Softmax(dim=-1) - - def forward(self, x): - qkv = self.query_key_value(x) - all_head_size = qkv.shape[-1] // 3 - num_attention_heads = all_head_size // self.attention_head_size - new_qkv_shape = qkv.shape[:-1] + \ - (num_attention_heads, 3 * self.attention_head_size) - qkv = qkv.view(new_qkv_shape) - qkv = qkv.permute((0, 2, 1, 3)) - q, k, v = torch.chunk(qkv, 3, dim=-1) - - x = torch.matmul(q, k.transpose(-1, -2)) - x = x / math.sqrt(self.attention_head_size) - x = self.softmax(x) - x = self.attention_dropout(x) - - x = torch.matmul(x, v) - x = x.transpose(1, 2) - new_context_layer_shape = x.size()[:-2] + (all_head_size, ) - x = x.reshape(new_context_layer_shape) - - x = self.dense(x) - x = self.dropout(x) - - return x - - -@LAYERS.register_module -class ViTMLP(nn.Module): - def __init__(self, - dim: int, - mlp_ratio: int, - activation: Callable, - dropout: float, - dtype: dtype = None, - bias: bool = True, - init_method: str = 'torch'): - super().__init__() - self.dense_1 = col_nn.Linear(dim, - mlp_ratio * dim, - dtype=dtype, - bias=bias, - **_init_rules[init_method]['transformer']) - self.activation = activation - self.dropout_1 = col_nn.Dropout(dropout) - self.dense_2 = col_nn.Linear(mlp_ratio * dim, - dim, - dtype=dtype, - bias=bias, - **_init_rules[init_method]['transformer']) - self.dropout_2 = col_nn.Dropout(dropout) - - def forward(self, x): - x = self.dense_1(x) - x = 
self.activation(x) - x = self.dropout_1(x) - x = self.dense_2(x) - x = self.dropout_2(x) - return x - - -@LAYERS.register_module -class ViTHead(nn.Module): - def __init__(self, - dim: int, - num_classes: int, - representation_size: int = None, - dtype: dtype = None, - bias: bool = True, - init_method: str = 'torch'): - super().__init__() - if representation_size: - self.representation = col_nn.Linear(dim, - representation_size, - bias=bias, - dtype=dtype, - **_init_rules[init_method]['head']) - else: - self.representation = None - representation_size = dim - - self.dense = col_nn.Classifier(representation_size, - num_classes, - dtype=dtype, - bias=bias, - **_init_rules[init_method]['head']) - - def forward(self, x): - x = x[:, 0] - if self.representation is not None: - x = self.representation(x) - x = self.dense(x) - return x - - -@LAYERS.register_module -class ViTBlock(CheckpointModule): - def __init__(self, - dim: int, - num_heads: int, - mlp_ratio: int, - activation: Callable, - attention_dropout: float = 0., - dropout: float = 0., - drop_path: float = 0., - layernorm_epsilon: float = 1e-6, - dtype: dtype = None, - bias: bool = True, - checkpoint: bool = False, - init_method: str = 'torch'): - super().__init__(checkpoint) - self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.attn = ViTSelfAttention(dim=dim, - num_heads=num_heads, - attention_dropout=attention_dropout, - dropout=dropout, - bias=bias, - dtype=dtype, - init_method=init_method) - self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - self.mlp = ViTMLP(dim=dim, - mlp_ratio=mlp_ratio, - activation=activation, - dropout=dropout, - dtype=dtype, - bias=bias, - init_method=init_method) - - def _forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -@MODELS.register_module -class VisionTransformer(nn.Module): - def __init__(self, - img_size: int = 224, - patch_size: int = 16, - in_chans: int = 3, - num_classes: int = 1000, - depth: int = 12, - num_heads: int = 12, - dim: int = 768, - mlp_ratio: int = 4, - attention_dropout: float = 0., - dropout: float = 0.1, - drop_path: float = 0., - layernorm_epsilon: float = 1e-6, - activation: Callable = nn.functional.gelu, - representation_size: int = None, - dtype: dtype = None, - bias: bool = True, - checkpoint: bool = False, - init_method: str = 'torch'): - super().__init__() - - embed = ViTEmbedding(img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embedding_dim=dim, - dropout=dropout, - dtype=dtype, - init_method=init_method) - - # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] - blocks = [ - ViTBlock( - dim=dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - attention_dropout=attention_dropout, - dropout=dropout, - drop_path=dpr[i], - activation=activation, - dtype=dtype, - bias=bias, - checkpoint=checkpoint, - init_method=init_method, - ) for i in range(depth) - ] - - norm = col_nn.LayerNorm(normalized_shape=dim, eps=layernorm_epsilon, dtype=dtype) - - head = ViTHead(dim=dim, - num_classes=num_classes, - representation_size=representation_size, - dtype=dtype, - bias=bias, - init_method=init_method) - - self.layers = nn.Sequential( - embed, - *blocks, - norm, - head, - ) - - def forward(self, x): - x = self.layers(x) - return x - - -def _create_vit_model(**model_kwargs): - model = 
VisionTransformer(**model_kwargs) - return model - - -@MODELS.register_module -def vit_lite_depth7_patch4_32(**kwargs): - model_kwargs = dict(img_size=32, patch_size=4, dim=256, depth=7, num_heads=4, mlp_ratio=2, num_classes=10, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_tiny_patch4_32(**kwargs): - model_kwargs = dict(img_size=32, patch_size=4, dim=512, depth=6, num_heads=8, mlp_ratio=1, num_classes=10, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_tiny_patch16_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_tiny_patch16_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=16, dim=192, depth=12, num_heads=3, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_small_patch16_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_small_patch16_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=16, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_small_patch32_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_small_patch32_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=32, dim=384, depth=12, num_heads=6, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_base_patch16_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_base_patch16_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=16, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_base_patch32_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_base_patch32_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=32, dim=768, depth=12, num_heads=12, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_large_patch16_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_large_patch16_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=16, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_large_patch32_224(**kwargs): - model_kwargs = dict(img_size=224, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) - - -@MODELS.register_module -def vit_large_patch32_384(**kwargs): - model_kwargs = dict(img_size=384, patch_size=32, dim=1024, depth=24, num_heads=16, mlp_ratio=4, **kwargs) - return _create_vit_model(**model_kwargs) diff --git 
a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 82e9c3c66..03101d69f 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,3 +1,4 @@
 pytest
 torchvision
 transformers
+titans
diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
index 965d722b4..69fd1ee2d 100644
--- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
+++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
@@ -1,4 +1,5 @@
 import os
+
 from functools import partial
 from pathlib import Path
@@ -6,19 +7,21 @@ import colossalai
 import pytest
 import torch
 import torch.multiprocessing as mp
-from colossalai.amp.amp_type import AMP_TYPE
-from colossalai.builder import build_pipeline_model
-from colossalai.engine.schedule import PipelineSchedule
-from colossalai.logging import get_dist_logger
-from colossalai.nn import LinearWarmupLR
-from colossalai.nn.loss import CrossEntropyLoss
+from colossalai.amp import AMP_TYPE
 from colossalai.trainer import Trainer, hooks
-from colossalai.utils import free_port, get_dataloader
-from colossalai.engine.gradient_accumulation import GradAccumLrSchedulerByStep
+from colossalai.context import ParallelMode
 from colossalai.testing import rerun_if_address_is_in_use
-from model_zoo.vit import vit_tiny_patch4_32
-from torchvision import transforms
-from torchvision.datasets import CIFAR10
+from colossalai.utils import free_port
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CrossEntropyLoss
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.utils import is_using_pp, get_dataloader
+from colossalai.utils.model.pipelinable import PipelinableContext
+from tqdm import tqdm
+
+from titans.dataloader.cifar10 import build_cifar
+from titans.model.vit import vit_tiny_patch4_32
 BATCH_SIZE = 4
 NUM_EPOCHS = 60
@@ -34,35 +37,35 @@ def run_trainer(rank, world_size, port):
     logger = get_dist_logger()
-    model = vit_tiny_patch4_32()
-    pipe_model = build_pipeline_model(model.layers, num_chunks=1)
+    # get logger
+    logger = get_dist_logger()
-    # build dataloaders
-    transform_train = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
-    ])
+    pipelinable = PipelinableContext()
+    with pipelinable:
+        model = vit_tiny_patch4_32()
+    pipelinable.to_layer_list()
+    pipelinable.load_policy("uniform")
+    model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
-    train_dataset = CIFAR10(root=Path(os.environ['DATA']), train=True, download=True, transform=transform_train)
-    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
+    # create dataloaders
+    root = Path(os.environ['DATA'])
+    train_dataloader, test_dataloader = build_cifar(BATCH_SIZE, root, pad_if_needed=True, crop=32, resize=32)
-    # build criterion
-    criterion = CrossEntropyLoss()
+    # create loss function
+    criterion = CrossEntropyLoss(label_smoothing=0.1)
-    # optimizer
-    optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0)
+    # create optimizer
+    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
-    # lr_scheduler
-    steps_per_epoch = GradAccumLrSchedulerByStep.compute_effective_steps_per_epoch(train_dataloader, accumulate_size=2)
-    total_steps = steps_per_epoch * NUM_EPOCHS
-    warmup_steps = steps_per_epoch * WARMUP_EPOCHS
-    lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps)
+    # create lr scheduler
+    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
-    engine, train_dataloader, _, lr_scheduler = colossalai.initialize(pipe_model,
-                                                                      optimizer,
-                                                                      criterion,
-                                                                      train_dataloader,
-                                                                      lr_scheduler=lr_scheduler)
+    # initialize
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(model=model,
+                                                                         optimizer=optimizer,
+                                                                         criterion=criterion,
+                                                                         train_dataloader=train_dataloader,
+                                                                         test_dataloader=test_dataloader)
     logger = get_dist_logger()