mirror of https://github.com/hpcaitech/ColossalAI
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent to gpc
* [legacy] make optim independent to registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
parent 536397cc95
commit 554aa9592e
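Most of the hunks below mechanically rewrite import paths after the move of `communication` and `nn` into the `legacy` package. A minimal sketch of the migration for downstream code, using only module paths that appear in the hunks below (everything else is illustrative):

# Old paths removed by this commit:
# from colossalai.communication import all_gather, broadcast
# from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
# import colossalai.nn as col_nn

# New paths introduced by this commit:
from colossalai.legacy.communication import all_gather, broadcast
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
import colossalai.legacy.nn as col_nn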
@@ -4,7 +4,7 @@ from typing import Optional, Set
 import torch
 import torch.nn as nn

-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.tensor_utils import free_storage

 from .region_manager import RegionManager
@@ -1,5 +1,4 @@
 class Registry:
-    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

     def __init__(self, name):
         self.name = name
@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
 import torch
 import torch.nn as nn
 from torch.optim import Optimizer
-from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
-from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.nn.optimizer import ColossalaiOptimizer
@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
         checkpoint_path (str): Path to the checkpoint directory.
         is_master (bool): Whether current rank is main process.
     """
+    try:
+        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
+        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
+    except ImportError:
+        return
     if not isinstance(model, PreTrainedModel):
         return

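The hunk above makes the Hugging Face dependency optional for checkpoint saving: the module-level `transformers` imports removed two hunks earlier are redone lazily inside `save_config_file`, so the function becomes a silent no-op when `transformers` is not installed. A standalone sketch of the same guarded-import pattern (the function name and body here are illustrative; only the `transformers` import path is taken from the diff):

import torch.nn as nn


def export_hf_config(model: nn.Module, checkpoint_path: str) -> None:
    try:
        # Imported lazily so environments without transformers still work.
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return
    if not isinstance(model, PreTrainedModel):
        return
    # A real implementation would unwrap the model and write model.config here.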
@@ -1,6 +1,6 @@
 import torch

-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn


 class MLP(torch.nn.Module):
@@ -1,6 +1,6 @@
 import torch

-from colossalai.nn.layer.colossalai_layer import Embedding, Linear
+from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 from colossalai.utils import get_current_device

 from .bias_dropout_add import bias_dropout_add_fused_train
@@ -1,9 +1,17 @@
-from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
-from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
-                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
-                  recv_forward, recv_backward)
+from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
+from .p2p import (
+    recv_backward,
+    recv_forward,
+    send_backward,
+    send_backward_recv_backward,
+    send_backward_recv_forward,
+    send_forward,
+    send_forward_backward_recv_forward_backward,
+    send_forward_recv_backward,
+    send_forward_recv_forward,
+)
 from .ring import ring_forward
-from .utils import send_obj_meta, recv_obj_meta
+from .utils import recv_obj_meta, send_obj_meta

 __all__ = [
     'all_gather',
@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union

 import torch.cuda

-import colossalai.communication as comm
+import colossalai.legacy.communication as comm
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
@@ -5,10 +5,10 @@ from typing import Iterable, Tuple

 import torch.cuda

-import colossalai.communication.p2p_v2 as comm
-from colossalai import engine
+import colossalai.legacy.communication.p2p_v2 as comm
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.engine import Engine
 from colossalai.utils.cuda import get_current_device

 from ._pipeline_schedule import PipelineSchedule
@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
     """

     def forward_backward_step(self,
-                              engine: engine.Engine,
+                              engine: Engine,
                               data_iter: Iterable,
                               forward_only=False,
                               return_loss=True,
@@ -0,0 +1,4 @@
+from ._ops import *
+from .layer import *
+from .loss import *
+from .metric import *
@@ -4,7 +4,7 @@ import torch
 import torch.distributed as dist

 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import divide
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup

 GeneralTensor = Union[ColoTensor, torch.Tensor]
@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
     return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)


-### table wise embedding shard
+# table wise embedding shard


 def _all_to_all_for_tablewise(x: torch.Tensor,
@@ -1,8 +1,10 @@
-import torch.nn.functional as F
 from typing import Optional

+import torch.nn.functional as F
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
-    ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input

@@ -1,9 +1,11 @@
-import torch.nn.functional as F
 from typing import Optional

+import torch.nn.functional as F
 from torch import Tensor

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,7 +1,10 @@
 from typing import List, Optional

 import torch.nn.functional as F

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,9 +1,12 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
-from typing import Optional
-from colossalai.tensor.op_wrapper import colo_op_impl
+
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
 from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl

 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -0,0 +1,9 @@
+from .colossalai_layer import *
+from .parallel_1d import *
+from .parallel_2d import *
+from .parallel_2p5d import *
+from .parallel_3d import *
+from .parallel_sequence import *
+from .utils import *
+from .vanilla import *
+from .wrapper import *
@@ -1,7 +1,7 @@
 from ._utils import partition_batch
 from .dropout import Dropout
 from .embedding import Embedding, PatchEmbedding
 from .linear import Classifier, Linear
 from .normalization import LayerNorm

 __all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
@@ -1,151 +1,152 @@
 import math
 from typing import Callable

-from colossalai.utils import get_current_device
 from torch import dtype, nn

-from ... import init as init
+from colossalai.nn import init
+from colossalai.utils import get_current_device
+
 from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
 from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
 from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
 from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
 from ..utils import get_tensor_parallel_mode
 from ..vanilla import VanillaPatchEmbedding
 from ._utils import ColossalaiModule

 _parallel_embedding = {
     '1d': Embedding1D,
     '2d': Embedding2D,
     '2.5d': Embedding2p5D,
     '3d': Embedding3D,
 }

 _vocab_parallel_embedding = {
     '1d': VocabParallelEmbedding1D,
     '2d': VocabParallelEmbedding2D,
     '2.5d': VocabParallelEmbedding2p5D,
     '3d': VocabParallelEmbedding3D
 }

 _parallel_patchembedding = {
     None: VanillaPatchEmbedding,
     '1d': PatchEmbedding1D,
     '2d': PatchEmbedding2D,
     '2.5d': PatchEmbedding2p5D,
     '3d': PatchEmbedding3D
 }


 class Embedding(ColossalaiModule):
     r"""Embedding for colossalai.

     Args:
         num_embeddings (int): number of embeddings.
         embedding_dim (int): dimension of embedding.
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
             therefore, the embedding vector at padding_idx is not updated during training,
             i.e. it remains as a fixed “pad”, defaults to None.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         weight_initializer (:class:`typing.Callable`, optional):
             he initializer of weight, defaults to normal initializer.

     The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
     ::

         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
             renormalized to have norm max_norm. Note: this will modify weight in-place.
         norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
         scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
             of frequency of the words in the mini-batch. Default False.
         sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

     More details about ``args`` and ``kwargs`` could be found in
     `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

     More details about ``initializer`` please refer to
     `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
     """

     def __init__(self,
                  num_embeddings: int,
                  embedding_dim: int,
                  padding_idx: int = None,
                  dtype: dtype = None,
                  weight_initializer: Callable = init.normal_(),
                  vocab_parallel_limit: int = 2048,
                  *args,
                  **kwargs) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel is None:
             embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
                                  **kwargs).to(dtype).to(get_current_device())
             weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
         elif num_embeddings <= vocab_parallel_limit:
             embed = _parallel_embedding[tensor_parallel](
                 num_embeddings,
                 embedding_dim,
                 padding_idx=padding_idx,
                 dtype=dtype,
                 weight_initializer=weight_initializer,
                 *args,
                 **kwargs,
             )
         else:
             embed = _vocab_parallel_embedding[tensor_parallel](
                 num_embeddings,
                 embedding_dim,
                 padding_idx=padding_idx,
                 dtype=dtype,
                 weight_initializer=weight_initializer,
                 *args,
                 **kwargs,
             )
         super().__init__(embed)


 class PatchEmbedding(ColossalaiModule):
     """2D Image to Patch Embedding.

     Args:
         img_size (int): image size.
         patch_size (int): patch size.
         in_chans (int): number of channels of input image.
         embed_size (int): size of embedding.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         flatten (bool, optional): whether to flatten output tensor, defaults to True.
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
             The initializer of bias, defaults to xavier uniform initializer.
         position_embed_initializer (:class:`typing.Callable`, optional):
             The initializer of position embedding, defaults to zeros initializer.

     More details about ``initializer`` please refer to
     `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
     """

     def __init__(
         self,
         img_size: int,
         patch_size: int,
         in_chans: int,
         embed_size: int,
         dtype: dtype = None,
         flatten: bool = True,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         position_embed_initializer: Callable = init.zeros_()
     ) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         embed = _parallel_patchembedding[tensor_parallel](
             img_size,
             patch_size,
             in_chans,
             embed_size,
             dtype=dtype,
             flatten=flatten,
             weight_initializer=weight_initializer,
             bias_initializer=bias_initializer,
             position_embed_initializer=position_embed_initializer,
         )
         super().__init__(embed)
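The relocated `Embedding` and `PatchEmbedding` wrappers above dispatch on the configured tensor-parallel mode: with no mode set they fall back to plain `torch.nn` layers, otherwise they instantiate the matching 1D/2D/2.5D/3D layer, switching to the vocab-parallel embedding once `num_embeddings` exceeds `vocab_parallel_limit`. A minimal usage sketch under the new import path (the surrounding colossalai launch/configuration boilerplate is assumed and omitted):

import colossalai.legacy.nn as col_nn

# Falls back to torch.nn.Embedding when no tensor-parallel mode is configured;
# picks Embedding1D/2D/2.5D/3D or a VocabParallelEmbedding variant otherwise.
embed = col_nn.Embedding(num_embeddings=32000, embedding_dim=768)
patch_embed = col_nn.PatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)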
@@ -4,9 +4,9 @@ from typing import Callable

 from torch import dtype, nn

+from colossalai.nn import init
 from colossalai.utils import get_current_device

-from ... import init as init
 from ..parallel_1d import *
 from ..parallel_2d import *
 from ..parallel_2p5d import *
@@ -1,41 +1,42 @@
-from colossalai.utils import get_current_device
 from torch import nn

+from colossalai.utils import get_current_device
+
 from ..parallel_1d import LayerNorm1D
 from ..parallel_2d import LayerNorm2D
 from ..parallel_2p5d import LayerNorm2p5D
 from ..parallel_3d import LayerNorm3D
 from ..utils import get_tensor_parallel_mode
 from ..vanilla import VanillaLayerNorm
 from ._utils import ColossalaiModule

 _parallel_layernorm = {
     None: VanillaLayerNorm,
     "1d": LayerNorm1D,
     "2d": LayerNorm2D,
     "2.5d": LayerNorm2p5D,
     "3d": LayerNorm3D,
 }


 class LayerNorm(ColossalaiModule):
     r"""Layer Normalization for colossalai.

     Args:
         normalized_shape (int): input shape from an expected input of size.
             :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
                 \times \ldots \times \text{normalized_shape}[-1]]`
             If a single integer is used, it is treated as a singleton list, and this module will
             normalize over the last dimension which is expected to be of that specific size.
         eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
         bias (bool, optional): Whether to add a bias, defaults to ``True``.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
     """

     def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel is None:
             norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
         else:
             norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
         super().__init__(norm)
@@ -0,0 +1,17 @@
+from .layers import (
+    Classifier1D,
+    Dropout1D,
+    Embedding1D,
+    LayerNorm1D,
+    Linear1D,
+    Linear1D_Col,
+    Linear1D_Row,
+    PatchEmbedding1D,
+    VocabParallelClassifier1D,
+    VocabParallelEmbedding1D,
+)
+
+__all__ = [
+    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
+    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
+]
@@ -3,6 +3,7 @@

 import torch
 import torch.distributed as dist
+
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env

@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
 class _SplitForwardGatherBackward(torch.autograd.Function):
     """
     Split the input and keep only the corresponding chuck to the rank.

     Args:
         input_: input matrix.
         parallel_mode: parallel mode.
@@ -10,11 +10,11 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn.parameter import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)

 __all__ = [
     'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple

 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device


 def matmul_2d(
@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opa = [None] * 2
         opb = [None] * 2
@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opb = [None] * 2
         opr = [None] * 2
@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opa = [None] * 2
         opr = [None] * 2
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
-from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
-                     VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
+from .layers import (
+    Classifier2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VocabParallelClassifier2p5D,
+    VocabParallelEmbedding2p5D,
+)

 __all__ = [
     'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
@@ -2,13 +2,14 @@ from typing import Any, Tuple

 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
+from colossalai.utils import get_current_device


 def get_parallel_group(parallel_mode: ParallelMode):
     return gpc.get_group(parallel_mode)
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
-from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
-                     VocabParallelEmbedding3D)
+from .layers import (
+    Classifier3D,
+    Embedding3D,
+    LayerNorm3D,
+    Linear3D,
+    PatchEmbedding3D,
+    VocabParallelClassifier3D,
+    VocabParallelEmbedding3D,
+)

 __all__ = [
     'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
@@ -7,10 +7,10 @@ import torch
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter

 from ._utils import get_parallel_mode_from_env, push_async_grad

@@ -8,14 +8,14 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import all_reduce, broadcast
 from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import all_reduce, broadcast
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
-from colossalai.nn.layer.base_layer import ParallelLayer
 from colossalai.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
@@ -1,4 +1,4 @@
-from ._operation import RingQK, RingAV
+from ._operation import RingAV, RingQK
 from .layers import TransformerSelfAttentionRing

 __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
@@ -3,13 +3,13 @@

 import torch
 from torch import distributed as dist
+from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import ring_forward
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
+from colossalai.legacy.communication import ring_forward
+from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
 from colossalai.utils import get_current_device
-from torch.cuda.amp import custom_bwd, custom_fwd


 class RingQK(torch.autograd.Function):
@@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.kernel import FusedScaleMaskSoftmax
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK


 @LAYERS.register_module
@@ -0,0 +1,15 @@
+from .common import (
+    ACT2FN,
+    CheckpointModule,
+    _ntuple,
+    divide,
+    get_tensor_parallel_mode,
+    set_tensor_parallel_attribute_by_partition,
+    set_tensor_parallel_attribute_by_size,
+    to_2tuple,
+)
+
+__all__ = [
+    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
+    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
+]
@@ -6,10 +6,11 @@ from itertools import repeat

 import numpy as np
 import torch
+from torch import Tensor, nn

 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.utils import checkpoint
-from torch import Tensor, nn


 class CheckpointModule(nn.Module):
@@ -1,6 +1,8 @@
-import torch.nn as nn
-import torch.distributed as dist
 from typing import List, Tuple, Union

+import torch.distributed as dist
+import torch.nn as nn
+
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
@@ -0,0 +1,41 @@
+from torch import nn
+from torch.nn.modules.loss import *
+from torch.nn.modules.loss import _Loss
+
+from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
+from .loss_1d import VocabParallelCrossEntropyLoss1D
+from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
+from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
+from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
+
+_parallel_cross_entropy = {
+    '2d': CrossEntropyLoss2D,
+    '2.5d': CrossEntropyLoss2p5D,
+    '3d': CrossEntropyLoss3D,
+}
+
+_vocab_parallel_cross_entropy = {
+    '1d': VocabParallelCrossEntropyLoss1D,
+    '2d': VocabParallelCrossEntropyLoss2D,
+    '2.5d': VocabParallelCrossEntropyLoss2p5D,
+    '3d': VocabParallelCrossEntropyLoss3D,
+}
+
+
+class CrossEntropyLoss(_Loss):
+
+    def __init__(self, reduction: bool = True, *args, **kwargs):
+        super().__init__()
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel is not None and env.vocab_parallel:
+            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+        elif tensor_parallel is None or tensor_parallel == '1d':
+            reduction = 'mean' if reduction else 'none'
+            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
+        else:
+            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+
+    def forward(self, *args):
+        return self.loss(*args)
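The new `CrossEntropyLoss` wrapper above picks a vocab-parallel loss when tensor parallelism with `env.vocab_parallel` is active, a 2D/2.5D/3D parallel loss for those modes, and plain `torch.nn.CrossEntropyLoss` otherwise. A minimal usage sketch (this view does not show file paths, so the `colossalai.legacy.nn.loss` module path is an assumption; tensors are placeholders):

import torch

from colossalai.legacy.nn.loss import CrossEntropyLoss  # assumed location of the file added above

criterion = CrossEntropyLoss(reduction=True)
logits = torch.randn(4, 10)            # placeholder predictions
labels = torch.randint(0, 10, (4,))    # placeholder targets
loss = criterion(logits, labels)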
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
-from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.utils import get_current_device

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
-from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.utils import get_current_device

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.utils import get_current_device

@@ -1,26 +1,28 @@
 from torch import nn

+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
 from ._utils import calc_acc
 from .accuracy_2d import Accuracy2D
 from .accuracy_2p5d import Accuracy2p5D
 from .accuracy_3d import Accuracy3D
-from colossalai.nn.layer.utils import get_tensor_parallel_mode

 _parallel_accuracy = {
     '2d': Accuracy2D,
     '2.5d': Accuracy2p5D,
     '3d': Accuracy3D,
 }


 class Accuracy(nn.Module):
+
     def __init__(self):
         super().__init__()
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel not in _parallel_accuracy:
             self.acc = calc_acc
         else:
             self.acc = _parallel_accuracy[tensor_parallel]()

     def forward(self, *args):
         return self.acc(*args)
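The relocated `Accuracy` metric follows the same dispatch scheme, falling back to a plain arg-max comparison (`calc_acc`) when no 2D/2.5D/3D mode is configured. A minimal sketch (module path assumed, since file names are not visible in this view):

import torch

from colossalai.legacy.nn.metric import Accuracy  # assumed location of the module shown above

metric = Accuracy()
logits = torch.randn(4, 10)
targets = torch.randint(0, 10, (4,))
num_correct = metric(logits, targets)  # returns the count of correct predictions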
@@ -1,7 +1,7 @@
 import torch


 def calc_acc(logits, targets):
     preds = torch.argmax(logits, dim=-1)
     correct = torch.sum(targets == preds)
     return correct
@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from torch import nn

+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+
 from ._utils import calc_acc

@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from torch import nn

+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+
 from ._utils import calc_acc

@@ -1,33 +1,35 @@
 import torch
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from torch import nn

+from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+
 from ._utils import calc_acc


 class Accuracy3D(nn.Module):
     """Accuracy for 3D parallelism
     """
+
     def __init__(self):
         super().__init__()
         self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
         self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

     def forward(self, logits, targets):
         """Calculate the accuracy of predicted labels.

         Args:
             logits (:class:`torch.tensor`): Predicted labels.
             targets (:class:`torch.tensor`): True labels from data.

         Returns:
             float: the accuracy of prediction.
         """
         with torch.no_grad():
             targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
             targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
             correct = calc_acc(logits, targets)
             correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
             return correct
@@ -1,10 +1,17 @@
+from .cache_embedding import (
+    CachedEmbeddingBag,
+    CachedParamMgr,
+    EvictionStrategy,
+    LimitBuffIndexCopyer,
+    ParallelCachedEmbeddingBag,
+    ParallelCachedEmbeddingBagTablewise,
+    ParallelCachedEmbeddingBagTablewiseSpiltCache,
+    TablewiseEmbeddingBagConfig,
+)
 from .colo_module import ColoModule
-from .linear import ColoLinear
 from .embedding import ColoEmbedding
-from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
-from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
-    ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
+from .linear import ColoLinear
+from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module

 __all__ = [
     'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
@@ -1,8 +1,8 @@
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from .copyer import LimitBuffIndexCopyer
 from .cached_embedding import CachedEmbeddingBag
-from .parallel_cached_embedding import ParallelCachedEmbeddingBag
+from .copyer import LimitBuffIndexCopyer
 from .embedding_config import TablewiseEmbeddingBagConfig
+from .parallel_cached_embedding import ParallelCachedEmbeddingBag
 from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
 from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
@@ -1,4 +1,5 @@
 import abc
+
 import torch.nn as nn
@@ -1,12 +1,14 @@
-import numpy as np
-import torch
-from torch.profiler import record_function
-from typing import List, Optional
-from contexttimer import Timer
-from .copyer import LimitBuffIndexCopyer
-from enum import Enum
 import sys
 from contextlib import contextmanager
+from enum import Enum
+from typing import List, Optional
+
+import numpy as np
+import torch
+from contexttimer import Timer
+from torch.profiler import record_function
+
+from .copyer import LimitBuffIndexCopyer
 
 
 class EvictionStrategy(Enum):
@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
 class CachedParamMgr(torch.nn.Module):
     """
     Manage Embedding Weights on CPU and CUDA memory uses a software cache.
     CPU maintains the entire original weight.
     CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
     During training, GPU needs to transmit embedding rows between CPU and GPU.
     Args:
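The docstring above describes the split the software cache relies on: the CPU holds the full embedding weight while CUDA keeps only cuda_row_num rows that are needed soon. A back-of-envelope sketch of why that split pays off; the numbers and the fp32 sizing below are illustrative assumptions, not values taken from the code.

num_embeddings, embedding_dim = 1_000_000, 64
cuda_row_num = 10_000                                    # rows resident in the CUDA cache

cpu_weight_bytes = num_embeddings * embedding_dim * 4    # full fp32 copy kept on CPU
cuda_cache_bytes = cuda_row_num * embedding_dim * 4      # only the cached rows sit on CUDA
print(cpu_weight_bytes // 2**20, cuda_cache_bytes // 2**20)   # ~244 MiB on CPU vs ~2 MiB on CUDA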
@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
         self._elapsed_dict[name] += t.elapsed
 
     def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
         """_find_evict_gpu_idxs
         Find the gpu idxs to be evicted, according to their freq.
         Args:
             evict_num (int): how many rows has to be evicted
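_find_evict_gpu_idxs is documented above as picking the rows to evict according to their access frequency. A small sketch of that least-frequently-used selection, assuming a per-row frequency tensor; the name and the use of torch.topk are illustrative assumptions, not the actual method body.

import torch

def find_evict_gpu_idxs_sketch(freq_of_cached_rows: torch.Tensor, evict_num: int) -> torch.Tensor:
    # Return the indices of the `evict_num` cached rows with the lowest frequency.
    _, idxs = torch.topk(freq_of_cached_rows, k=evict_num, largest=False)
    return idxs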
@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
         """reorder
         reorder the weight according to ids' frequency in dataset before training.
         Execute only once before training, also known as warmup phase.
 
         Note:
             If you would like to use the DATASET as the eviction strategy, you must call this function.
         Note:
@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
         """
         deprecated
         evict one row from cuda to cpu.
         Returns:
             (int) : the slot id be evicted.
         """
         mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
@@ -1,10 +1,11 @@
+from typing import Iterator, List, Optional, Tuple, Union
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple, Union
+from torch.nn.parameter import Parameter
 
 from .base_embedding import BaseEmbeddingBag
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from torch.nn.parameter import Parameter
 
 
 class CachedEmbeddingBag(BaseEmbeddingBag):
@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
         ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
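One parameter listed above, include_last_offset, follows the same CSR-style convention as the stock PyTorch nn.EmbeddingBag, which can be demonstrated with plain PyTorch; the sizes below are made up for illustration only.

import torch
import torch.nn as nn

# With include_last_offset=True, offsets carries one extra trailing element equal
# to len(indices), so N bags need N+1 offsets.
bag = nn.EmbeddingBag(10, 4, mode='mean', include_last_offset=True)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4, 8])      # 2 bags; last element == len(indices)
out = bag(indices, offsets)
print(out.shape)                        # torch.Size([2, 4])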
@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
                     buffer_size=50_000,
                     pin_weight=False):
         """
         Called after initialized.
         Reorder the weight rows according to the ids_freq_mapping.
         Then, let the weights of the Module be managed by a CachedParamMgr.
 
         Args:
             cuda_row_num (int): number of rows can be hosted in CUDA memory
             ids_freq_mapping (List[int]): a list, idx is id number, value is freq
@@ -3,7 +3,7 @@ from torch import LongTensor
 
 
 class LimitBuffIndexCopyer(object):
     """LimitBuffIndexCopyer
     Index Copy using limited temp buffer on CUDA.
 
     Args:
@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
 
     @torch.no_grad()
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
         The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
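The index_copy docstring above describes a buffered gather/scatter: rows are read contiguously from the source, staged in a bounded temporary buffer, and scattered into the target. A sketch of that pattern with plain torch calls follows; the function name, buffer size, and device handling are assumptions, not the class implementation.

import torch

def buffered_index_copy_sketch(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # Move at most buff_size rows per step: src[src_index] -> tmp -> tgt[tgt_index].
    for start in range(0, src_index.numel(), buff_size):
        s_idx = src_index[start:start + buff_size].to(src.device)
        t_idx = tgt_index[start:start + buff_size].to(tgt.device)
        tmp = src.index_select(dim, s_idx).to(tgt.device)   # contiguous gather
        tgt.index_copy_(dim, t_idx, tmp)                     # scattered write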
@@ -1,12 +1,13 @@
+from typing import Iterator, List, Optional, Tuple
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple
 
-from .cached_embedding import CachedEmbeddingBag
-from colossalai.nn._ops._utils import dual_all_to_all
-from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
+from colossalai.legacy.nn._ops._utils import dual_all_to_all
+from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
+
 from .cache_mgr import CachedParamMgr, EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
 
 
 def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
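get_partition, whose signature appears as context above, splits the embedding dimension across tensor-parallel ranks. Below is a generic even-split sketch with the same shape of signature; the meaning of the returned bool in the real helper is an assumption here (taken as "the dimension divides evenly").

from typing import Tuple

def get_partition_sketch(embedding_dim: int, rank: int, world_size: int) -> Tuple[int, int, bool]:
    # Give each rank a contiguous [start, end) slice; earlier ranks absorb the remainder.
    base = embedding_dim // world_size
    remainder = embedding_dim % world_size
    start = rank * base + min(rank, remainder)
    end = start + base + (1 if rank < remainder else 0)
    return start, end, remainder == 0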
@@ -1,15 +1,16 @@
+import time
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-from .cached_embedding import CachedEmbeddingBag
-from .cache_mgr import EvictionStrategy
-from .embedding_config import TablewiseEmbeddingBagConfig
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
 
-from typing import List
-import time
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
@@ -1,17 +1,17 @@
+import abc
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.profiler import record_function
 
-from .cached_embedding import CachedEmbeddingBag
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from .embedding_config import TablewiseEmbeddingBagConfig
-from .cache_mgr import EvictionStrategy
 
-from typing import List
-import abc
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
@@ -1,6 +1,7 @@
-from colossalai.tensor.distspec import _DistSpec
+from typing import Dict, List
+
 from colossalai.tensor import ComputePattern
-from typing import List, Dict
+from colossalai.tensor.distspec import _DistSpec
 
 
 class ColoModule(object):
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoEmbedding(ColoModule):
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoLinear(ColoModule):
@@ -1,9 +1,11 @@
 from typing import Dict
-from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
-from colossalai.tensor import distspec
-from . import ColoModule
+
 import torch
 
+from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
+
+from . import ColoModule
+
 _COLOSSAL_MODULES: Dict[type, ColoModule] = {}
@@ -7,9 +7,9 @@ from typing import Callable
 import torch
 import torch.distributed as dist
 
-from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
 from colossalai.utils import get_current_device, is_no_pp_or_last_stage
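The hook above now pulls all_reduce from colossalai.legacy.communication. For readers who only need the idea, the kind of cross-rank reduction such metric hooks perform can be sketched with plain torch.distributed; the scalar value and device choice below are illustrative, and this is not the hook's actual code.

import torch
import torch.distributed as dist

def average_scalar_across_ranks(value: float) -> float:
    # Sum the scalar over all ranks, then divide by the world size.
    t = torch.tensor([value], device='cuda' if torch.cuda.is_available() else 'cpu')
    if dist.is_initialized():
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        t /= dist.get_world_size()
    return t.item()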
@@ -6,8 +6,7 @@ import logging
 from pathlib import Path
 from typing import List, Union
 
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist
 
 
 class DistributedLogger:
@@ -63,6 +62,7 @@ class DistributedLogger:
         self._logger.propagate = False
 
         DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0
 
     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ class DistributedLogger:
         # create log directory
         path.mkdir(parents=True, exist_ok=True)
 
-        # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)
 
         # add file handler
@@ -128,19 +122,14 @@ class DistributedLogger:
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)
 
-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)
 
-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.
 
         Args:
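After this refactor the logger no longer consults the global parallel context: it caches the process rank once from torch.distributed and filters on it. A minimal stand-alone sketch of that pattern follows; SimpleRankLogger is a hypothetical stand-in, not the DistributedLogger API.

import torch.distributed as dist

class SimpleRankLogger:
    def __init__(self):
        # Fall back to rank 0 when the process group has not been initialized.
        self.rank = dist.get_rank() if dist.is_initialized() else 0

    def info(self, message, ranks=None):
        # Emit only when no rank filter is given or this rank is listed.
        if ranks is None or self.rank in ranks:
            print(f"[rank {self.rank}] {message}")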
@@ -150,10 +139,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)
 
-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.
 
         Args:
@@ -163,10 +152,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('warning', message_prefix, parallel_mode, ranks)
-        self._log('warning', message, parallel_mode, ranks)
+        self._log('warning', message_prefix, ranks)
+        self._log('warning', message, ranks)
 
-    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def debug(self, message: str, ranks: List[int] = None) -> None:
         """Log a debug message.
 
         Args:
@@ -176,10 +165,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('debug', message_prefix, parallel_mode, ranks)
-        self._log('debug', message, parallel_mode, ranks)
+        self._log('debug', message_prefix, ranks)
+        self._log('debug', message, ranks)
 
-    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def error(self, message: str, ranks: List[int] = None) -> None:
         """Log an error message.
 
         Args:
@@ -189,5 +178,5 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('error', message_prefix, parallel_mode, ranks)
-        self._log('error', message, parallel_mode, ranks)
+        self._log('error', message_prefix, ranks)
+        self._log('error', message, ranks)
@@ -1,6 +1,5 @@
-from ._ops import *
+from .init import *
 from .layer import *
 from .loss import *
 from .lr_scheduler import *
-from .metric import *
 from .optimizer import *
@@ -1,10 +1,2 @@
-from .colossalai_layer import *
-from .parallel_1d import *
-from .parallel_2d import *
-from .parallel_2p5d import *
-from .parallel_3d import *
-from .parallel_sequence import *
 from .moe import *
 from .utils import *
-from .vanilla import *
-from .wrapper import *
@@ -1,7 +0,0 @@
-from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
-                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
-
-__all__ = [
-    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
-    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
-]
@@ -0,0 +1,14 @@
+def divide(numerator, denominator):
+    """Only allow exact division.
+
+    Args:
+        numerator (int): Numerator of the division.
+        denominator (int): Denominator of the division.
+
+    Returns:
+        int: the result of exact division.
+    """
+    assert denominator != 0, 'denominator can not be zero'
+    assert numerator % denominator == 0, \
+        '{} is not divisible by {}'.format(numerator, denominator)
+    return numerator // denominator
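The new divide helper above only permits exact division. A quick usage check, restating the same logic so the snippet runs on its own:

def divide(numerator, denominator):
    # Same behaviour as the helper added in the hunk above.
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator

assert divide(12, 4) == 3     # exact division passes
# divide(7, 2) would raise AssertionError: "7 is not divisible by 2"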
@@ -1,7 +0,0 @@
-from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
-                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)
-
-__all__ = [
-    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
-    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
-]
@@ -1,41 +1 @@
-from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import get_tensor_parallel_mode
-from torch import nn
-from torch.nn.modules.loss import *
-from torch.nn.modules.loss import _Loss
-
-from .loss_1d import VocabParallelCrossEntropyLoss1D
-from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
-from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
-from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
 from .loss_moe import MoeCrossEntropyLoss, MoeLoss
-
-_parallel_cross_entropy = {
-    '2d': CrossEntropyLoss2D,
-    '2.5d': CrossEntropyLoss2p5D,
-    '3d': CrossEntropyLoss3D,
-}
-
-_vocab_parallel_cross_entropy = {
-    '1d': VocabParallelCrossEntropyLoss1D,
-    '2d': VocabParallelCrossEntropyLoss2D,
-    '2.5d': VocabParallelCrossEntropyLoss2p5D,
-    '3d': VocabParallelCrossEntropyLoss3D,
-}
-
-
-class CrossEntropyLoss(_Loss):
-
-    def __init__(self, reduction: bool = True, *args, **kwargs):
-        super().__init__()
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel is not None and env.vocab_parallel:
-            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-        elif tensor_parallel is None or tensor_parallel == '1d':
-            reduction = 'mean' if reduction else 'none'
-            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
-        else:
-            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-
-    def forward(self, *args):
-        return self.loss(*args)
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingLR(_CosineAnnealingLR):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
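With the LR_SCHEDULERS.register_module decorator removed above (and in the hunks that follow), these schedulers are plain classes that are instantiated directly rather than looked up from a registry. A minimal sketch of direct use with vanilla PyTorch objects; the model, learning rate, and step counts here are made up for illustration.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=0.0)

for _ in range(10):
    optimizer.step()      # normally preceded by forward/backward
    scheduler.step()      # anneal the learning rate once per step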
@@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
         super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingWarmupLR(WarmupScheduler):
     """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
         super().__init__(optimizer, warmup_steps, base_scheduler)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingLR(DelayerScheduler):
     """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
@@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
         super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
     """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
     applied, and then the learning rate will be a fixed value before starting decay.
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
@@ -2,12 +2,9 @@ from typing import List
 
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
|
||||||
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
|
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
|
||||||
|
|
||||||
|
|
||||||
@LR_SCHEDULERS.register_module
|
|
||||||
class MultiStepWarmupLR(WarmupScheduler):
|
class MultiStepWarmupLR(WarmupScheduler):
|
||||||
"""Multistep learning rate scheduler with warmup.
|
"""Multistep learning rate scheduler with warmup.
|
||||||
|
|
||||||
|
|
|
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
@@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                 for base_lr in self.base_lrs]
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
@@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
|
||||||
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||||
|
|
||||||
|
|
||||||
@LR_SCHEDULERS.register_module
|
|
||||||
class MultiplicativeLR(_MultiplicativeLR):
|
class MultiplicativeLR(_MultiplicativeLR):
|
||||||
"""Multiply the learning rate of each parameter group by the factor given
|
"""Multiply the learning rate of each parameter group by the factor given
|
||||||
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
||||||
|
@@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ class StepLR(_StepLR):
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
Some files were not shown because too many files have changed in this diff.