mirror of https://github.com/hpcaitech/ColossalAI
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent to gpc
* [legacy] make optim independent to registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
parent 536397cc95
commit 554aa9592e
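Note: the diff below is largely a mechanical import-path migration — `colossalai.communication` moves to `colossalai.legacy.communication`, and the tensor-parallel layers, losses and metrics under `colossalai.nn` move to `colossalai.legacy.nn`. A minimal sketch of how downstream code can adapt, assuming the legacy modules keep the same public names (the try/except fallback is illustrative and not part of this PR):

```python
# Hedged sketch: prefer the new legacy namespace introduced by this PR and fall
# back to the old location so the same script also runs on older releases.
try:
    import colossalai.legacy.nn as col_nn                    # new location (this PR)
    from colossalai.legacy.communication import all_reduce   # moved collective helper
except ImportError:
    import colossalai.nn as col_nn                           # pre-#4671 location
    from colossalai.communication import all_reduce
```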
@@ -4,7 +4,7 @@ from typing import Optional, Set
import torch
import torch.nn as nn

from colossalai.nn.parallel.data_parallel import _cast_float
from colossalai.utils import _cast_float
from colossalai.zero.legacy.gemini.tensor_utils import free_storage

from .region_manager import RegionManager

@@ -1,5 +1,4 @@
class Registry:
    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

    def __init__(self, name):
        self.name = name

@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
import torch
import torch.nn as nn
from torch.optim import Optimizer
from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.nn.optimizer import ColossalaiOptimizer

@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
        checkpoint_path (str): Path to the checkpoint directory.
        is_master (bool): Whether current rank is main process.
    """
    try:
        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
    except ImportError:
        return
    if not isinstance(model, PreTrainedModel):
        return

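The hunk above moves the `transformers` import inside `save_config_file` and guards it with try/except, so checkpoint saving no longer hard-depends on `transformers`. A simplified sketch of the resulting pattern (the helper name and the final `save_pretrained` call are assumptions; the rest of the function body is not shown in this hunk):

```python
def save_hf_config_if_possible(model, checkpoint_path: str, is_master: bool = True) -> None:
    # Import lazily: if transformers is not installed, skip saving the HF config.
    try:
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return
    # Only Hugging Face models carry a config worth writing next to the checkpoint.
    if not isinstance(model, PreTrainedModel):
        return
    if is_master:
        model.config.save_pretrained(checkpoint_path)  # hypothetical: the actual saving code is elided above
```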
@@ -1,6 +1,6 @@
import torch

import colossalai.nn as col_nn
import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):

@@ -1,6 +1,6 @@
import torch

from colossalai.nn.layer.colossalai_layer import Embedding, Linear
from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
from colossalai.utils import get_current_device

from .bias_dropout_add import bias_dropout_add_fused_train

@@ -1,9 +1,17 @@
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
                  recv_forward, recv_backward)
from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from .p2p import (
    recv_backward,
    recv_forward,
    send_backward,
    send_backward_recv_backward,
    send_backward_recv_forward,
    send_forward,
    send_forward_backward_recv_forward_backward,
    send_forward_recv_backward,
    send_forward_recv_forward,
)
from .ring import ring_forward
from .utils import send_obj_meta, recv_obj_meta
from .utils import recv_obj_meta, send_obj_meta

__all__ = [
    'all_gather',

@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union

import torch.cuda

import colossalai.communication as comm
import colossalai.legacy.communication as comm
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc

@@ -5,10 +5,10 @@ from typing import Iterable, Tuple

import torch.cuda

import colossalai.communication.p2p_v2 as comm
from colossalai import engine
import colossalai.legacy.communication.p2p_v2 as comm
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.engine import Engine
from colossalai.utils.cuda import get_current_device

from ._pipeline_schedule import PipelineSchedule

@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
    """

    def forward_backward_step(self,
                              engine: engine.Engine,
                              engine: Engine,
                              data_iter: Iterable,
                              forward_only=False,
                              return_loss=True,

@@ -0,0 +1,4 @@
from ._ops import *
from .layer import *
from .loss import *
from .metric import *

@@ -4,7 +4,7 @@ import torch
import torch.distributed as dist

from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import divide
from colossalai.legacy.nn.layer.utils import divide
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup

GeneralTensor = Union[ColoTensor, torch.Tensor]

@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
    return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)


### table wise embedding shard
# table wise embedding shard


def _all_to_all_for_tablewise(x: torch.Tensor,

@@ -1,8 +1,10 @@
import torch.nn.functional as F
from typing import Optional

import torch.nn.functional as F

from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
    ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input

@@ -1,9 +1,11 @@
import torch.nn.functional as F
from typing import Optional

import torch.nn.functional as F
from torch import Tensor

from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
    ShardSpec, ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,7 +1,10 @@
from typing import List, Optional

import torch.nn.functional as F

from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,9 +1,12 @@
from typing import Optional

import torch
import torch.nn.functional as F
from typing import Optional
from colossalai.tensor.op_wrapper import colo_op_impl

from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor.op_wrapper import colo_op_impl

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -0,0 +1,9 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *

@@ -1,7 +1,7 @@
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']

@ -1,151 +1,152 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import dtype, nn
|
||||
|
||||
from ... import init as init
|
||||
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
|
||||
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
|
||||
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
|
||||
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaPatchEmbedding
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_embedding = {
|
||||
'1d': Embedding1D,
|
||||
'2d': Embedding2D,
|
||||
'2.5d': Embedding2p5D,
|
||||
'3d': Embedding3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_embedding = {
|
||||
'1d': VocabParallelEmbedding1D,
|
||||
'2d': VocabParallelEmbedding2D,
|
||||
'2.5d': VocabParallelEmbedding2p5D,
|
||||
'3d': VocabParallelEmbedding3D
|
||||
}
|
||||
|
||||
_parallel_patchembedding = {
|
||||
None: VanillaPatchEmbedding,
|
||||
'1d': PatchEmbedding1D,
|
||||
'2d': PatchEmbedding2D,
|
||||
'2.5d': PatchEmbedding2p5D,
|
||||
'3d': PatchEmbedding3D
|
||||
}
|
||||
|
||||
|
||||
class Embedding(ColossalaiModule):
|
||||
r"""Embedding for colossalai.
|
||||
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
he initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
weight_initializer: Callable = init.normal_(),
|
||||
vocab_parallel_limit: int = 2048,
|
||||
*args,
|
||||
**kwargs) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
|
||||
**kwargs).to(dtype).to(get_current_device())
|
||||
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
|
||||
elif num_embeddings <= vocab_parallel_limit:
|
||||
embed = _parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
embed = _vocab_parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
super().__init__(embed)
|
||||
|
||||
|
||||
class PatchEmbedding(ColossalaiModule):
|
||||
"""2D Image to Patch Embedding.
|
||||
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embed_size: int,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer: Callable = init.zeros_()
|
||||
) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
embed = _parallel_patchembedding[tensor_parallel](
|
||||
img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embed_size,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
weight_initializer=weight_initializer,
|
||||
bias_initializer=bias_initializer,
|
||||
position_embed_initializer=position_embed_initializer,
|
||||
)
|
||||
super().__init__(embed)
|
||||
import math
|
||||
from typing import Callable
|
||||
|
||||
from torch import dtype, nn
|
||||
|
||||
from colossalai.nn import init
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
|
||||
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
|
||||
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
|
||||
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaPatchEmbedding
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_embedding = {
|
||||
'1d': Embedding1D,
|
||||
'2d': Embedding2D,
|
||||
'2.5d': Embedding2p5D,
|
||||
'3d': Embedding3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_embedding = {
|
||||
'1d': VocabParallelEmbedding1D,
|
||||
'2d': VocabParallelEmbedding2D,
|
||||
'2.5d': VocabParallelEmbedding2p5D,
|
||||
'3d': VocabParallelEmbedding3D
|
||||
}
|
||||
|
||||
_parallel_patchembedding = {
|
||||
None: VanillaPatchEmbedding,
|
||||
'1d': PatchEmbedding1D,
|
||||
'2d': PatchEmbedding2D,
|
||||
'2.5d': PatchEmbedding2p5D,
|
||||
'3d': PatchEmbedding3D
|
||||
}
|
||||
|
||||
|
||||
class Embedding(ColossalaiModule):
|
||||
r"""Embedding for colossalai.
|
||||
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
he initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
weight_initializer: Callable = init.normal_(),
|
||||
vocab_parallel_limit: int = 2048,
|
||||
*args,
|
||||
**kwargs) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
|
||||
**kwargs).to(dtype).to(get_current_device())
|
||||
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
|
||||
elif num_embeddings <= vocab_parallel_limit:
|
||||
embed = _parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
embed = _vocab_parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
super().__init__(embed)
|
||||
|
||||
|
||||
class PatchEmbedding(ColossalaiModule):
|
||||
"""2D Image to Patch Embedding.
|
||||
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embed_size: int,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer: Callable = init.zeros_()
|
||||
) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
embed = _parallel_patchembedding[tensor_parallel](
|
||||
img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embed_size,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
weight_initializer=weight_initializer,
|
||||
bias_initializer=bias_initializer,
|
||||
position_embed_initializer=position_embed_initializer,
|
||||
)
|
||||
super().__init__(embed)
|
|
@ -4,9 +4,9 @@ from typing import Callable
|
|||
|
||||
from torch import dtype, nn
|
||||
|
||||
from colossalai.nn import init
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ... import init as init
|
||||
from ..parallel_1d import *
|
||||
from ..parallel_2d import *
|
||||
from ..parallel_2p5d import *
|
|
@ -1,41 +1,42 @@
|
|||
from colossalai.utils import get_current_device
|
||||
from torch import nn
|
||||
|
||||
from ..parallel_1d import LayerNorm1D
|
||||
from ..parallel_2d import LayerNorm2D
|
||||
from ..parallel_2p5d import LayerNorm2p5D
|
||||
from ..parallel_3d import LayerNorm3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaLayerNorm
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_layernorm = {
|
||||
None: VanillaLayerNorm,
|
||||
"1d": LayerNorm1D,
|
||||
"2d": LayerNorm2D,
|
||||
"2.5d": LayerNorm2p5D,
|
||||
"3d": LayerNorm3D,
|
||||
}
|
||||
|
||||
|
||||
class LayerNorm(ColossalaiModule):
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
bias (bool, optional): Whether to add a bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
|
||||
else:
|
||||
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
|
||||
super().__init__(norm)
|
||||
from torch import nn
|
||||
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ..parallel_1d import LayerNorm1D
|
||||
from ..parallel_2d import LayerNorm2D
|
||||
from ..parallel_2p5d import LayerNorm2p5D
|
||||
from ..parallel_3d import LayerNorm3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaLayerNorm
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_layernorm = {
|
||||
None: VanillaLayerNorm,
|
||||
"1d": LayerNorm1D,
|
||||
"2d": LayerNorm2D,
|
||||
"2.5d": LayerNorm2p5D,
|
||||
"3d": LayerNorm3D,
|
||||
}
|
||||
|
||||
|
||||
class LayerNorm(ColossalaiModule):
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
bias (bool, optional): Whether to add a bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
|
||||
else:
|
||||
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
|
||||
super().__init__(norm)
|
|
@ -0,0 +1,17 @@
|
|||
from .layers import (
|
||||
Classifier1D,
|
||||
Dropout1D,
|
||||
Embedding1D,
|
||||
LayerNorm1D,
|
||||
Linear1D,
|
||||
Linear1D_Col,
|
||||
Linear1D_Row,
|
||||
PatchEmbedding1D,
|
||||
VocabParallelClassifier1D,
|
||||
VocabParallelEmbedding1D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
|
||||
'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
|
||||
]
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
|
||||
|
@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
|
|||
class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
"""
|
||||
Split the input and keep only the corresponding chuck to the rank.
|
||||
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
|
@ -10,11 +10,11 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.kernel import LayerNorm
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import (
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_2d, split_batch_2d
|
||||
from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
|
||||
VocabParallelEmbedding2D)
|
||||
from .layers import (
|
||||
Classifier2D,
|
||||
Embedding2D,
|
||||
LayerNorm2D,
|
||||
Linear2D,
|
||||
PatchEmbedding2D,
|
||||
VocabParallelClassifier2D,
|
||||
VocabParallelEmbedding2D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
|
|
@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
||||
def matmul_2d(
|
||||
|
@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opa = [None] * 2
|
||||
opb = [None] * 2
|
||||
|
@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opb = [None] * 2
|
||||
opr = [None] * 2
|
||||
|
@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opa = [None] * 2
|
||||
opr = [None] * 2
|
|
@ -8,10 +8,10 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
|
||||
VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
|
||||
from .layers import (
|
||||
Classifier2p5D,
|
||||
Embedding2p5D,
|
||||
LayerNorm2p5D,
|
||||
Linear2p5D,
|
||||
PatchEmbedding2p5D,
|
||||
VocabParallelClassifier2p5D,
|
||||
VocabParallelEmbedding2p5D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
|
|
@ -2,13 +2,14 @@ from typing import Any, Tuple
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
||||
def get_parallel_group(parallel_mode: ParallelMode):
|
||||
return gpc.get_group(parallel_mode)
|
|
@ -8,10 +8,10 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import (
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
|
||||
from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
|
||||
VocabParallelEmbedding3D)
|
||||
from .layers import (
|
||||
Classifier3D,
|
||||
Embedding3D,
|
||||
LayerNorm3D,
|
||||
Linear3D,
|
||||
PatchEmbedding3D,
|
||||
VocabParallelClassifier3D,
|
||||
VocabParallelEmbedding3D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
|
|
@ -7,10 +7,10 @@ import torch
|
|||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
|
||||
|
||||
from ._utils import get_parallel_mode_from_env, push_async_grad
|
||||
|
|
@ -8,14 +8,14 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import all_reduce, broadcast
|
||||
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import all_reduce, broadcast
|
||||
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.nn.layer.base_layer import ParallelLayer
|
||||
from colossalai.utils.checkpointing import (
|
||||
broadcast_state_dict,
|
||||
gather_tensor_parallel_state_dict,
|
|
@ -1,4 +1,4 @@
|
|||
from ._operation import RingQK, RingAV
|
||||
from ._operation import RingAV, RingQK
|
||||
from .layers import TransformerSelfAttentionRing
|
||||
|
||||
__all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
|
|
@ -3,13 +3,13 @@
|
|||
|
||||
import torch
|
||||
from torch import distributed as dist
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.communication import ring_forward
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
|
||||
from colossalai.legacy.communication import ring_forward
|
||||
from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
|
||||
from colossalai.utils import get_current_device
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
|
||||
class RingQK(torch.autograd.Function):
|
|
@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
|
|||
from colossalai.core import global_context as gpc
|
||||
from colossalai.kernel import FusedScaleMaskSoftmax
|
||||
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
|
||||
from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK
|
||||
|
||||
|
||||
@LAYERS.register_module
|
|
@ -0,0 +1,15 @@
|
|||
from .common import (
|
||||
ACT2FN,
|
||||
CheckpointModule,
|
||||
_ntuple,
|
||||
divide,
|
||||
get_tensor_parallel_mode,
|
||||
set_tensor_parallel_attribute_by_partition,
|
||||
set_tensor_parallel_attribute_by_size,
|
||||
to_2tuple,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
|
||||
'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
|
||||
]
|
|
@ -6,10 +6,11 @@ from itertools import repeat
|
|||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.utils import checkpoint
|
||||
from torch import Tensor, nn
|
||||
|
||||
|
||||
class CheckpointModule(nn.Module):
|
|
@ -1,6 +1,8 @@
|
|||
import torch.nn as nn
|
||||
import torch.distributed as dist
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
from torch import nn
|
||||
from torch.nn.modules.loss import *
|
||||
from torch.nn.modules.loss import _Loss
|
||||
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
from .loss_1d import VocabParallelCrossEntropyLoss1D
|
||||
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
|
||||
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
|
||||
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
|
||||
|
||||
_parallel_cross_entropy = {
|
||||
'2d': CrossEntropyLoss2D,
|
||||
'2.5d': CrossEntropyLoss2p5D,
|
||||
'3d': CrossEntropyLoss3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_cross_entropy = {
|
||||
'1d': VocabParallelCrossEntropyLoss1D,
|
||||
'2d': VocabParallelCrossEntropyLoss2D,
|
||||
'2.5d': VocabParallelCrossEntropyLoss2p5D,
|
||||
'3d': VocabParallelCrossEntropyLoss3D,
|
||||
}
|
||||
|
||||
|
||||
class CrossEntropyLoss(_Loss):
|
||||
|
||||
def __init__(self, reduction: bool = True, *args, **kwargs):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is not None and env.vocab_parallel:
|
||||
self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
|
||||
elif tensor_parallel is None or tensor_parallel == '1d':
|
||||
reduction = 'mean' if reduction else 'none'
|
||||
self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
|
||||
else:
|
||||
self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
|
||||
|
||||
def forward(self, *args):
|
||||
return self.loss(*args)
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -1,26 +1,28 @@
|
|||
from torch import nn
|
||||
|
||||
from ._utils import calc_acc
|
||||
from .accuracy_2d import Accuracy2D
|
||||
from .accuracy_2p5d import Accuracy2p5D
|
||||
from .accuracy_3d import Accuracy3D
|
||||
from colossalai.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
_parallel_accuracy = {
|
||||
'2d': Accuracy2D,
|
||||
'2.5d': Accuracy2p5D,
|
||||
'3d': Accuracy3D,
|
||||
}
|
||||
|
||||
|
||||
class Accuracy(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel not in _parallel_accuracy:
|
||||
self.acc = calc_acc
|
||||
else:
|
||||
self.acc = _parallel_accuracy[tensor_parallel]()
|
||||
|
||||
def forward(self, *args):
|
||||
return self.acc(*args)
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
from ._utils import calc_acc
|
||||
from .accuracy_2d import Accuracy2D
|
||||
from .accuracy_2p5d import Accuracy2p5D
|
||||
from .accuracy_3d import Accuracy3D
|
||||
|
||||
_parallel_accuracy = {
|
||||
'2d': Accuracy2D,
|
||||
'2.5d': Accuracy2p5D,
|
||||
'3d': Accuracy3D,
|
||||
}
|
||||
|
||||
|
||||
class Accuracy(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel not in _parallel_accuracy:
|
||||
self.acc = calc_acc
|
||||
else:
|
||||
self.acc = _parallel_accuracy[tensor_parallel]()
|
||||
|
||||
def forward(self, *args):
|
||||
return self.acc(*args)
|
|
@ -1,7 +1,7 @@
|
|||
import torch
|
||||
|
||||
|
||||
def calc_acc(logits, targets):
|
||||
preds = torch.argmax(logits, dim=-1)
|
||||
correct = torch.sum(targets == preds)
|
||||
return correct
|
||||
import torch
|
||||
|
||||
|
||||
def calc_acc(logits, targets):
|
||||
preds = torch.argmax(logits, dim=-1)
|
||||
correct = torch.sum(targets == preds)
|
||||
return correct
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
|
@ -1,33 +1,35 @@
|
|||
import torch
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from torch import nn
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
||||
class Accuracy3D(nn.Module):
|
||||
"""Accuracy for 3D parallelism
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
|
||||
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
correct = calc_acc(logits, targets)
|
||||
correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
|
||||
return correct
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
||||
class Accuracy3D(nn.Module):
|
||||
"""Accuracy for 3D parallelism
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
|
||||
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
correct = calc_acc(logits, targets)
|
||||
correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
|
||||
return correct
|
|
@ -1,10 +1,17 @@
|
|||
from .cache_embedding import (
|
||||
CachedEmbeddingBag,
|
||||
CachedParamMgr,
|
||||
EvictionStrategy,
|
||||
LimitBuffIndexCopyer,
|
||||
ParallelCachedEmbeddingBag,
|
||||
ParallelCachedEmbeddingBagTablewise,
|
||||
ParallelCachedEmbeddingBagTablewiseSpiltCache,
|
||||
TablewiseEmbeddingBagConfig,
|
||||
)
|
||||
from .colo_module import ColoModule
|
||||
from .linear import ColoLinear
|
||||
from .embedding import ColoEmbedding
|
||||
from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
|
||||
|
||||
from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
|
||||
ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
|
||||
from .linear import ColoLinear
|
||||
from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module
|
||||
|
||||
__all__ = [
|
||||
'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
|
|
@ -1,8 +1,8 @@
|
|||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
|
||||
from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
|
||||
from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
import abc
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
|
@ -1,12 +1,14 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
from torch.profiler import record_function
|
||||
from typing import List, Optional
|
||||
from contexttimer import Timer
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from enum import Enum
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from contexttimer import Timer
|
||||
from torch.profiler import record_function
|
||||
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
|
||||
|
||||
class EvictionStrategy(Enum):
|
||||
|
@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
|
|||
class CachedParamMgr(torch.nn.Module):
|
||||
"""
|
||||
Manage Embedding Weights on CPU and CUDA memory uses a software cache.
|
||||
CPU maintains the entire original weight.
|
||||
CPU maintains the entire original weight.
|
||||
CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
|
||||
During training, GPU needs to transmit embedding rows between CPU and GPU.
|
||||
Args:
|
||||
|
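The docstring above describes the idea behind `CachedParamMgr`: the full embedding table stays in CPU memory while a working set of at most `cuda_row_num` rows is mirrored on the GPU and rows are swapped in and out as batches need them. A toy sketch of that caching scheme (not the ColossalAI API — the real manager evicts by access frequency and copies rows in batches):

```python
import torch


class ToyRowCache:
    """Keep the full embedding table on CPU and a bounded set of hot rows on the GPU."""

    def __init__(self, cpu_weight: torch.Tensor, cuda_row_num: int):
        self.cpu_weight = cpu_weight    # full table on CPU
        self.gpu_rows = {}              # row id -> row tensor resident on the GPU
        self.capacity = cuda_row_num

    def fetch(self, row: int) -> torch.Tensor:
        if row not in self.gpu_rows:
            if len(self.gpu_rows) >= self.capacity:
                # Evict an arbitrary resident row; the real manager picks victims by frequency.
                victim, tensor = self.gpu_rows.popitem()
                self.cpu_weight[victim].copy_(tensor.cpu())    # write the evicted row back
            self.gpu_rows[row] = self.cpu_weight[row].cuda()
        return self.gpu_rows[row]
```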
@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
self._elapsed_dict[name] += t.elapsed
|
||||
|
||||
def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
|
||||
"""_find_evict_gpu_idxs
|
||||
"""_find_evict_gpu_idxs
|
||||
Find the gpu idxs to be evicted, according to their freq.
|
||||
Args:
|
||||
evict_num (int): how many rows has to be evicted
|
||||
|
@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
"""reorder
|
||||
reorder the weight according to ids' frequency in dataset before training.
|
||||
Execute only once before training, also known as warmup phase.
|
||||
|
||||
|
||||
Note:
|
||||
If you would like to use the DATASET as the eviction strategy, you must call this function.
|
||||
Note:
|
||||
|
@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
"""
|
||||
deprecated
|
||||
evict one row from cuda to cpu.
|
||||
Returns:
|
||||
Returns:
|
||||
(int) : the slot id be evicted.
|
||||
"""
|
||||
mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
|
|
@ -1,10 +1,11 @@
|
|||
from typing import Iterator, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import List, Optional, Iterator, Tuple, Union
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .base_embedding import BaseEmbeddingBag
|
||||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
|
||||
class CachedEmbeddingBag(BaseEmbeddingBag):
|
||||
|
@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
|
|||
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
|
||||
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
|
||||
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
|
||||
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
|
||||
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
|
||||
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
|
||||
warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
|
||||
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
|
||||
|
@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
|
|||
buffer_size=50_000,
|
||||
pin_weight=False):
|
||||
"""
|
||||
Called after initialized.
|
||||
Called after initialized.
|
||||
Reorder the weight rows according to the ids_freq_mapping.
|
||||
Then, let the weights of the Module be managed by a CachedParamMgr.
|
||||
|
||||
|
||||
Args:
|
||||
cuda_row_num (int): number of rows can be hosted in CUDA memory
|
||||
ids_freq_mapping (List[int]): a list, idx is id number, value is freq
|
|
@ -3,7 +3,7 @@ from torch import LongTensor
|
|||
|
||||
|
||||
class LimitBuffIndexCopyer(object):
|
||||
"""LimitBuffIndexCopyer
|
||||
"""LimitBuffIndexCopyer
|
||||
Index Copy using limited temp buffer on CUDA.
|
||||
|
||||
Args:
|
||||
|
@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
|
|||
|
||||
@torch.no_grad()
|
||||
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
|
||||
"""copy
|
||||
"""copy
|
||||
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
|
||||
The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
|
||||
|
|
@ -1,12 +1,13 @@
|
|||
from typing import Iterator, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import List, Optional, Iterator, Tuple
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from colossalai.nn._ops._utils import dual_all_to_all
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all
|
||||
from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
|
||||
|
||||
from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
|
||||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
|
||||
|
||||
def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
|
|
@ -1,15 +1,16 @@
|
|||
import time
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
|
||||
|
||||
from typing import List
|
||||
import time
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
|
||||
|
||||
class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
|
|
@ -1,17 +1,17 @@
|
|||
import abc
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from torch.profiler import record_function
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from .cache_mgr import EvictionStrategy
|
||||
|
||||
from typing import List
|
||||
import abc
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
|
||||
|
||||
class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
|
|
@ -1,6 +1,7 @@
from colossalai.tensor.distspec import _DistSpec
from typing import Dict, List

from colossalai.tensor import ComputePattern
from typing import List, Dict
from colossalai.tensor.distspec import _DistSpec


class ColoModule(object):
@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec

from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec


class ColoEmbedding(ColoModule):
@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec

from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec


class ColoLinear(ColoModule):
@ -1,9 +1,11 @@
from typing import Dict
from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
from colossalai.tensor import distspec
from . import ColoModule

import torch

from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec

from . import ColoModule

_COLOSSAL_MODULES: Dict[type, ColoModule] = {}
@ -7,9 +7,9 @@ from typing import Callable
import torch
import torch.distributed as dist

from colossalai.communication import all_reduce
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.registry import HOOKS
from colossalai.utils import get_current_device, is_no_pp_or_last_stage
|
@ -6,8 +6,7 @@ import logging
|
|||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
import colossalai
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
import torch.distributed as dist
|
||||
|
||||
|
||||
class DistributedLogger:
|
||||
|
@ -63,6 +62,7 @@ class DistributedLogger:
        self._logger.propagate = False

        DistributedLogger.__instances[name] = self
        self.rank = dist.get_rank() if dist.is_initialized() else 0

    @staticmethod
    def __get_call_info():
@ -109,16 +109,10 @@ class DistributedLogger:
        # create log directory
        path.mkdir(parents=True, exist_ok=True)

        # set the default file name if path is a directory
        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
            rank = 0
        else:
            rank = colossalai.core.global_context.get_global_rank()

        if suffix is not None:
            log_file_name = f'rank_{rank}_{suffix}.log'
            log_file_name = f'rank_{self.rank}_{suffix}.log'
        else:
            log_file_name = f'rank_{rank}.log'
            log_file_name = f'rank_{self.rank}.log'
        path = path.joinpath(log_file_name)

        # add file handler
@ -128,19 +122,14 @@ class DistributedLogger:
        file_handler.setFormatter(formatter)
        self._logger.addHandler(file_handler)

    def _log(self,
             level,
             message: str,
             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
             ranks: List[int] = None) -> None:
    def _log(self, level, message: str, ranks: List[int] = None) -> None:
        if ranks is None:
            getattr(self._logger, level)(message)
        else:
            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
            if local_rank in ranks:
            if self.rank in ranks:
                getattr(self._logger, level)(message)

    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def info(self, message: str, ranks: List[int] = None) -> None:
        """Log an info message.

        Args:
@ -150,10 +139,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('info', message_prefix, parallel_mode, ranks)
        self._log('info', message, parallel_mode, ranks)
        self._log('info', message_prefix, ranks)
        self._log('info', message, ranks)

    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def warning(self, message: str, ranks: List[int] = None) -> None:
        """Log a warning message.

        Args:
@ -163,10 +152,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('warning', message_prefix, parallel_mode, ranks)
        self._log('warning', message, parallel_mode, ranks)
        self._log('warning', message_prefix, ranks)
        self._log('warning', message, ranks)

    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def debug(self, message: str, ranks: List[int] = None) -> None:
        """Log a debug message.

        Args:
@ -176,10 +165,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('debug', message_prefix, parallel_mode, ranks)
        self._log('debug', message, parallel_mode, ranks)
        self._log('debug', message_prefix, ranks)
        self._log('debug', message, ranks)

    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def error(self, message: str, ranks: List[int] = None) -> None:
        """Log an error message.

        Args:
@ -189,5 +178,5 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('error', message_prefix, parallel_mode, ranks)
        self._log('error', message, parallel_mode, ranks)
        self._log('error', message_prefix, ranks)
        self._log('error', message, ranks)
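After this refactor the logger filters messages by the global torch.distributed rank instead of a ParallelMode-specific rank. A minimal usage sketch; the accessor name, logger name, and message are assumptions for illustration:

import torch.distributed as dist

from colossalai.logging import get_dist_logger  # assumed public accessor for DistributedLogger

# Log only on global rank 0 once the default process group is initialized.
if dist.is_initialized():
    logger = get_dist_logger('example')          # logger name is arbitrary
    logger.info('training started', ranks=[0])   # filtered via self.rank, no ParallelMode argument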
@ -1,6 +1,5 @@
from ._ops import *
from .init import *
from .layer import *
from .loss import *
from .lr_scheduler import *
from .metric import *
from .optimizer import *
@ -1,10 +1,2 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .moe import *
from .utils import *
from .vanilla import *
from .wrapper import *
@ -1,7 +0,0 @@
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)

__all__ = [
    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
]
@ -0,0 +1,14 @@
def divide(numerator, denominator):
    """Only allow exact division.

    Args:
        numerator (int): Numerator of the division.
        denominator (int): Denominator of the division.

    Returns:
        int: the result of exact division.
    """
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator
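A quick usage note for the helper added above; the import path is an assumption, since the diff does not show where the new file lives:

from colossalai.legacy.nn.layer.utils.common import divide  # hypothetical path

assert divide(32, 8) == 4
# divide(32, 5) raises AssertionError: "32 is not divisible by 5"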
@ -1,7 +0,0 @@
from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)

__all__ = [
    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]
@ -1,41 +1 @@
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss

from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss

_parallel_cross_entropy = {
    '2d': CrossEntropyLoss2D,
    '2.5d': CrossEntropyLoss2p5D,
    '3d': CrossEntropyLoss3D,
}

_vocab_parallel_cross_entropy = {
    '1d': VocabParallelCrossEntropyLoss1D,
    '2d': VocabParallelCrossEntropyLoss2D,
    '2.5d': VocabParallelCrossEntropyLoss2p5D,
    '3d': VocabParallelCrossEntropyLoss3D,
}


class CrossEntropyLoss(_Loss):

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is not None and env.vocab_parallel:
            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
        else:
            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)

    def forward(self, *args):
        return self.loss(*args)
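For context, the removed wrapper above only dispatches: with vocab parallelism enabled it picks the matching VocabParallelCrossEntropyLoss* variant, with no tensor parallelism (or 1D) it falls back to torch.nn.CrossEntropyLoss, and otherwise it uses the 2D/2.5D/3D class. A hedged sketch of typical usage, assuming a tensor-parallel context has already been initialized; the shapes are illustrative:

import torch

# `CrossEntropyLoss` refers to the wrapper class shown in the removed module above.
criterion = CrossEntropyLoss(reduction=True)     # resolves to a 1d/2d/2.5d/3d loss internally
logits = torch.randn(8, 1000)                    # (batch, num_classes) on the local shard
labels = torch.randint(0, 1000, (8,))
loss = criterion(logits, labels)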
@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler


@LR_SCHEDULERS.register_module
class CosineAnnealingLR(_CosineAnnealingLR):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
        super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class CosineAnnealingWarmupLR(WarmupScheduler):
    """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
|
|||
super().__init__(optimizer, warmup_steps, base_scheduler)
|
||||
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class FlatAnnealingLR(DelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
|
||||
|
||||
|
@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
|
|||
super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
|
||||
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
|
||||
applied, and then the learning rate will be a fixed value before starting decay.
|
||||
|
|
|
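A brief usage sketch of the warmup scheduler described above; the toy model, optimizer, and the exact constructor arguments (total_steps, warmup_steps) are assumptions for illustration:

import torch

model = torch.nn.Linear(16, 16)                         # toy model, illustrative only
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Assumed signature: linear warmup for `warmup_steps`, then cosine annealing
# over the remaining steps, as the docstring above describes.
scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=100, warmup_steps=10)

for _ in range(100):
    optimizer.step()
    scheduler.step()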
@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import _LRScheduler

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler):
    """Linearly warmup learning rate and then linearly decay.
@ -2,12 +2,9 @@ from typing import List

from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import WarmupScheduler


@LR_SCHEDULERS.register_module
class MultiStepLR(_MultiStepLR):
    """Decays the learning rate of each parameter group by gamma once the
    number of epoch reaches one of the milestones. Notice that such decay can
@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
        super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
    """Multistep learning rate scheduler with warmup.
@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class OneCycleLR(_OneCycleLR):
    r"""Sets the learning rate of each parameter group according to the
    1cycle learning rate policy. The 1cycle policy anneals the learning
@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import _LRScheduler

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import WarmupScheduler


@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
    """Polynomial learning rate scheduler.

@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                for base_lr in self.base_lrs]


@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
    """Polynomial learning rate scheduler with warmup.
@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class LambdaLR(_LambdaLR):
    """Sets the learning rate of each parameter group to the initial lr
    times a given function. When last_epoch=-1, sets initial lr as lr.
@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
        super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR):
    """Multiply the learning rate of each parameter group by the factor given
    in the specified function. When last_epoch=-1, sets initial lr as lr.
@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
        super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class StepLR(_StepLR):
    """Decays the learning rate of each parameter group by gamma every
    step_size epochs. Notice that such decay can happen simultaneously with
@ -61,7 +56,6 @@ class StepLR(_StepLR):
        super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class ExponentialLR(_ExponentialLR):
    """Decays the learning rate of each parameter group by gamma every epoch.
    When last_epoch=-1, sets initial lr as lr
Some files were not shown because too many files have changed in this diff.