diff --git a/colossalai/auto_parallel/offload/base_offload_module.py b/colossalai/auto_parallel/offload/base_offload_module.py
index d0c328e13..5b9f74b13 100644
--- a/colossalai/auto_parallel/offload/base_offload_module.py
+++ b/colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@ from typing import Optional, Set
 import torch
 import torch.nn as nn
 
-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.tensor_utils import free_storage
 
 from .region_manager import RegionManager
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
index 1a90c72bd..730a90d74 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py
@@ -1,5 +1,4 @@
 class Registry:
-    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here
 
     def __init__(self, name):
         self.name = name
diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py
index 6dadaba3e..3441eca38 100644
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
 import torch
 import torch.nn as nn
 from torch.optim import Optimizer
-from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
-from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
 
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.nn.optimizer import ColossalaiOptimizer
@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
         checkpoint_path (str): Path to the checkpoint directory.
         is_master (bool): Whether current rank is main process.
     """
+    try:
+        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
+        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
+    except ImportError:
+        return
     if not isinstance(model, PreTrainedModel):
         return
 
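# Hedged sketch (not part of the patch): the lazy-import guard added to save_config_file
# above, shown in isolation. Only the guard itself is taken from the real function; the
# point is that `transformers` is imported when the function runs, so environments
# without it fall back to a silent no-op instead of failing at module import time.
import torch.nn as nn


def save_config_file_sketch(model: nn.Module, checkpoint_path: str, is_master: bool = True) -> None:
    try:
        from transformers.modeling_utils import PreTrainedModel
        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
    except ImportError:
        return    # transformers is now optional: quietly skip config saving
    if not isinstance(model, PreTrainedModel):
        return    # only HuggingFace models carry a serializable config
    # ... the real save_config_file continues here (unwrapping the model and writing its config)
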
diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py
index f8fd1c41a..385b485b6 100644
--- a/colossalai/cli/benchmark/models.py
+++ b/colossalai/cli/benchmark/models.py
@@ -1,6 +1,6 @@
 import torch
 
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 
 
 class MLP(torch.nn.Module):
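# Hedged migration sketch (not part of the patch): call sites that used the old layer
# namespace now import from colossalai.legacy.nn, exactly as the benchmark model above
# does; the layer names themselves are unchanged.
#
#   before:  import colossalai.nn as col_nn
#   after:   import colossalai.legacy.nn as col_nn
#
# TinyMLP below is an illustrative module written against the relocated namespace; it
# assumes a Colossal-AI context has been launched so col_nn.Linear can resolve the
# configured tensor-parallel mode (falling back to a vanilla linear layer otherwise).
import torch

import colossalai.legacy.nn as col_nn


class TinyMLP(torch.nn.Module):

    def __init__(self, dim: int = 256):
        super().__init__()
        self.dense_1 = col_nn.Linear(dim, dim * 4)
        self.dense_2 = col_nn.Linear(dim * 4, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dense_2(torch.nn.functional.gelu(self.dense_1(x)))
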
diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py
index e20c08b05..8eb4e0c88 100644
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -1,6 +1,6 @@
 import torch
 
-from colossalai.nn.layer.colossalai_layer import Embedding, Linear
+from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 from colossalai.utils import get_current_device
 
 from .bias_dropout_add import bias_dropout_add_fused_train
diff --git a/colossalai/communication/__init__.py b/colossalai/legacy/communication/__init__.py
similarity index 53%
rename from colossalai/communication/__init__.py
rename to colossalai/legacy/communication/__init__.py
index 220481b7a..88ad0487b 100644
--- a/colossalai/communication/__init__.py
+++ b/colossalai/legacy/communication/__init__.py
@@ -1,9 +1,17 @@
-from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
-from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
-                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
-                  recv_forward, recv_backward)
+from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
+from .p2p import (
+    recv_backward,
+    recv_forward,
+    send_backward,
+    send_backward_recv_backward,
+    send_backward_recv_forward,
+    send_forward,
+    send_forward_backward_recv_forward_backward,
+    send_forward_recv_backward,
+    send_forward_recv_forward,
+)
 from .ring import ring_forward
-from .utils import send_obj_meta, recv_obj_meta
+from .utils import recv_obj_meta, send_obj_meta
 
 __all__ = [
     'all_gather',
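# Hedged note (not part of the patch): the communication helpers keep their public names
# after the move; only the package root changes, which is why the pipeline schedules
# further down in this diff swap their import lines. Old and new spellings:
#
#   before:  from colossalai.communication import all_gather, send_forward, recv_forward
#   after:   from colossalai.legacy.communication import all_gather, send_forward, recv_forward
#
#   before:  import colossalai.communication.p2p_v2 as comm
#   after:   import colossalai.legacy.communication.p2p_v2 as comm
from colossalai.legacy.communication import all_gather, recv_forward, send_forward  # new location
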
diff --git a/colossalai/communication/collective.py b/colossalai/legacy/communication/collective.py
similarity index 100%
rename from colossalai/communication/collective.py
rename to colossalai/legacy/communication/collective.py
diff --git a/colossalai/communication/p2p.py b/colossalai/legacy/communication/p2p.py
similarity index 100%
rename from colossalai/communication/p2p.py
rename to colossalai/legacy/communication/p2p.py
diff --git a/colossalai/communication/p2p_v2.py b/colossalai/legacy/communication/p2p_v2.py
similarity index 100%
rename from colossalai/communication/p2p_v2.py
rename to colossalai/legacy/communication/p2p_v2.py
diff --git a/colossalai/communication/ring.py b/colossalai/legacy/communication/ring.py
similarity index 100%
rename from colossalai/communication/ring.py
rename to colossalai/legacy/communication/ring.py
diff --git a/colossalai/communication/utils.py b/colossalai/legacy/communication/utils.py
similarity index 100%
rename from colossalai/communication/utils.py
rename to colossalai/legacy/communication/utils.py
diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
index 88b54ce6a..4571fd679 100644
--- a/colossalai/legacy/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union
 
 import torch.cuda
 
-import colossalai.communication as comm
+import colossalai.legacy.communication as comm
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
index 9e7372b67..385c61537 100644
--- a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
@@ -5,10 +5,10 @@ from typing import Iterable, Tuple
 
 import torch.cuda
 
-import colossalai.communication.p2p_v2 as comm
-from colossalai import engine
+import colossalai.legacy.communication.p2p_v2 as comm
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.engine import Engine
 from colossalai.utils.cuda import get_current_device
 
 from ._pipeline_schedule import PipelineSchedule
@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
     """
 
     def forward_backward_step(self,
-                              engine: engine.Engine,
+                              engine: Engine,
                               data_iter: Iterable,
                               forward_only=False,
                               return_loss=True,
diff --git a/colossalai/legacy/nn/__init__.py b/colossalai/legacy/nn/__init__.py
new file mode 100644
index 000000000..500162901
--- /dev/null
+++ b/colossalai/legacy/nn/__init__.py
@@ -0,0 +1,4 @@
+from ._ops import *
+from .layer import *
+from .loss import *
+from .metric import *
diff --git a/colossalai/nn/_ops/__init__.py b/colossalai/legacy/nn/_ops/__init__.py
similarity index 100%
rename from colossalai/nn/_ops/__init__.py
rename to colossalai/legacy/nn/_ops/__init__.py
diff --git a/colossalai/nn/_ops/_utils.py b/colossalai/legacy/nn/_ops/_utils.py
similarity index 99%
rename from colossalai/nn/_ops/_utils.py
rename to colossalai/legacy/nn/_ops/_utils.py
index 24877bbb5..131c21547 100644
--- a/colossalai/nn/_ops/_utils.py
+++ b/colossalai/legacy/nn/_ops/_utils.py
@@ -4,7 +4,7 @@ import torch
 import torch.distributed as dist
 
 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import divide
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
 
 GeneralTensor = Union[ColoTensor, torch.Tensor]
@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
     return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)
 
 
-### table wise embedding shard
+# table wise embedding shard
 
 
 def _all_to_all_for_tablewise(x: torch.Tensor,
diff --git a/colossalai/nn/_ops/addmm.py b/colossalai/legacy/nn/_ops/addmm.py
similarity index 100%
rename from colossalai/nn/_ops/addmm.py
rename to colossalai/legacy/nn/_ops/addmm.py
diff --git a/colossalai/nn/_ops/batch_norm.py b/colossalai/legacy/nn/_ops/batch_norm.py
similarity index 100%
rename from colossalai/nn/_ops/batch_norm.py
rename to colossalai/legacy/nn/_ops/batch_norm.py
diff --git a/colossalai/nn/_ops/element_wise.py b/colossalai/legacy/nn/_ops/element_wise.py
similarity index 100%
rename from colossalai/nn/_ops/element_wise.py
rename to colossalai/legacy/nn/_ops/element_wise.py
diff --git a/colossalai/nn/_ops/embedding.py b/colossalai/legacy/nn/_ops/embedding.py
similarity index 98%
rename from colossalai/nn/_ops/embedding.py
rename to colossalai/legacy/nn/_ops/embedding.py
index a045f305b..b145d1763 100644
--- a/colossalai/nn/_ops/embedding.py
+++ b/colossalai/legacy/nn/_ops/embedding.py
@@ -1,8 +1,10 @@
-import torch.nn.functional as F
 from typing import Optional
+
+import torch.nn.functional as F
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
-    ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input
 
 
diff --git a/colossalai/nn/_ops/embedding_bag.py b/colossalai/legacy/nn/_ops/embedding_bag.py
similarity index 97%
rename from colossalai/nn/_ops/embedding_bag.py
rename to colossalai/legacy/nn/_ops/embedding_bag.py
index 0026f579b..9a656d587 100644
--- a/colossalai/nn/_ops/embedding_bag.py
+++ b/colossalai/legacy/nn/_ops/embedding_bag.py
@@ -1,9 +1,11 @@
-import torch.nn.functional as F
 from typing import Optional
+
+import torch.nn.functional as F
 from torch import Tensor
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor
 
 
diff --git a/colossalai/nn/_ops/layernorm.py b/colossalai/legacy/nn/_ops/layernorm.py
similarity index 92%
rename from colossalai/nn/_ops/layernorm.py
rename to colossalai/legacy/nn/_ops/layernorm.py
index 2b761b84e..9960c5d48 100644
--- a/colossalai/nn/_ops/layernorm.py
+++ b/colossalai/legacy/nn/_ops/layernorm.py
@@ -1,7 +1,10 @@
 from typing import List, Optional
+
 import torch.nn.functional as F
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor
 
 
diff --git a/colossalai/nn/_ops/linear.py b/colossalai/legacy/nn/_ops/linear.py
similarity index 100%
rename from colossalai/nn/_ops/linear.py
rename to colossalai/legacy/nn/_ops/linear.py
diff --git a/colossalai/nn/_ops/loss.py b/colossalai/legacy/nn/_ops/loss.py
similarity index 96%
rename from colossalai/nn/_ops/loss.py
rename to colossalai/legacy/nn/_ops/loss.py
index 1e54f6628..90efbfa36 100644
--- a/colossalai/nn/_ops/loss.py
+++ b/colossalai/legacy/nn/_ops/loss.py
@@ -1,9 +1,12 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
-from typing import Optional
-from colossalai.tensor.op_wrapper import colo_op_impl
+
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
 from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl
+
 from ._utils import GeneralTensor, convert_to_colo_tensor
 
 
diff --git a/colossalai/nn/_ops/view.py b/colossalai/legacy/nn/_ops/view.py
similarity index 100%
rename from colossalai/nn/_ops/view.py
rename to colossalai/legacy/nn/_ops/view.py
diff --git a/colossalai/legacy/nn/layer/__init__.py b/colossalai/legacy/nn/layer/__init__.py
new file mode 100644
index 000000000..86961dd93
--- /dev/null
+++ b/colossalai/legacy/nn/layer/__init__.py
@@ -0,0 +1,9 @@
+from .colossalai_layer import *
+from .parallel_1d import *
+from .parallel_2d import *
+from .parallel_2p5d import *
+from .parallel_3d import *
+from .parallel_sequence import *
+from .utils import *
+from .vanilla import *
+from .wrapper import *
diff --git a/colossalai/nn/layer/base_layer.py b/colossalai/legacy/nn/layer/base_layer.py
similarity index 100%
rename from colossalai/nn/layer/base_layer.py
rename to colossalai/legacy/nn/layer/base_layer.py
diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py
similarity index 97%
rename from colossalai/nn/layer/colossalai_layer/__init__.py
rename to colossalai/legacy/nn/layer/colossalai_layer/__init__.py
index 2ae1b07a7..ed743820d 100644
--- a/colossalai/nn/layer/colossalai_layer/__init__.py
+++ b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py
@@ -1,7 +1,7 @@
-from ._utils import partition_batch
-from .dropout import Dropout
-from .embedding import Embedding, PatchEmbedding
-from .linear import Classifier, Linear
-from .normalization import LayerNorm
-
-__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
+from ._utils import partition_batch
+from .dropout import Dropout
+from .embedding import Embedding, PatchEmbedding
+from .linear import Classifier, Linear
+from .normalization import LayerNorm
+
+__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/legacy/nn/layer/colossalai_layer/_utils.py
similarity index 100%
rename from colossalai/nn/layer/colossalai_layer/_utils.py
rename to colossalai/legacy/nn/layer/colossalai_layer/_utils.py
diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py
similarity index 100%
rename from colossalai/nn/layer/colossalai_layer/dropout.py
rename to colossalai/legacy/nn/layer/colossalai_layer/dropout.py
diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py
similarity index 97%
rename from colossalai/nn/layer/colossalai_layer/embedding.py
rename to colossalai/legacy/nn/layer/colossalai_layer/embedding.py
index e5c9c46e0..28bcb7ffe 100644
--- a/colossalai/nn/layer/colossalai_layer/embedding.py
+++ b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py
@@ -1,151 +1,152 @@
-import math
-from typing import Callable
-
-from colossalai.utils import get_current_device
-from torch import dtype, nn
-
-from ... import init as init
-from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
-from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
-from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
-from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
-from ..utils import get_tensor_parallel_mode
-from ..vanilla import VanillaPatchEmbedding
-from ._utils import ColossalaiModule
-
-_parallel_embedding = {
-    '1d': Embedding1D,
-    '2d': Embedding2D,
-    '2.5d': Embedding2p5D,
-    '3d': Embedding3D,
-}
-
-_vocab_parallel_embedding = {
-    '1d': VocabParallelEmbedding1D,
-    '2d': VocabParallelEmbedding2D,
-    '2.5d': VocabParallelEmbedding2p5D,
-    '3d': VocabParallelEmbedding3D
-}
-
-_parallel_patchembedding = {
-    None: VanillaPatchEmbedding,
-    '1d': PatchEmbedding1D,
-    '2d': PatchEmbedding2D,
-    '2.5d': PatchEmbedding2p5D,
-    '3d': PatchEmbedding3D
-}
-
-
-class Embedding(ColossalaiModule):
-    r"""Embedding for colossalai.
-
-    Args:
-        num_embeddings (int): number of embeddings.
-        embedding_dim (int): dimension of embedding.
-        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
-            therefore, the embedding vector at padding_idx is not updated during training,
-            i.e. it remains as a fixed “pad”, defaults to None.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        weight_initializer (:class:`typing.Callable`, optional):
-            he initializer of weight, defaults to normal initializer.
-
-    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
-    ::
-
-        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
-                    renormalized to have norm max_norm. Note: this will modify weight in-place.
-        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
-        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
-                    of frequency of the words in the mini-batch. Default False.
-        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
-
-    More details about ``args`` and ``kwargs`` could be found in
-    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
-    """
-
-    def __init__(self,
-                 num_embeddings: int,
-                 embedding_dim: int,
-                 padding_idx: int = None,
-                 dtype: dtype = None,
-                 weight_initializer: Callable = init.normal_(),
-                 vocab_parallel_limit: int = 2048,
-                 *args,
-                 **kwargs) -> None:
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel is None:
-            embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
-                                 **kwargs).to(dtype).to(get_current_device())
-            weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
-        elif num_embeddings <= vocab_parallel_limit:
-            embed = _parallel_embedding[tensor_parallel](
-                num_embeddings,
-                embedding_dim,
-                padding_idx=padding_idx,
-                dtype=dtype,
-                weight_initializer=weight_initializer,
-                *args,
-                **kwargs,
-            )
-        else:
-            embed = _vocab_parallel_embedding[tensor_parallel](
-                num_embeddings,
-                embedding_dim,
-                padding_idx=padding_idx,
-                dtype=dtype,
-                weight_initializer=weight_initializer,
-                *args,
-                **kwargs,
-            )
-        super().__init__(embed)
-
-
-class PatchEmbedding(ColossalaiModule):
-    """2D Image to Patch Embedding.
-
-    Args:
-        img_size (int): image size.
-        patch_size (int): patch size.
-        in_chans (int): number of channels of input image.
-        embed_size (int): size of embedding.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-        flatten (bool, optional): whether to flatten output tensor, defaults to True.
-        weight_initializer (:class:`typing.Callable`, optional):
-            The initializer of weight, defaults to kaiming uniform initializer.
-        bias_initializer (:class:`typing.Callable`, optional):
-            The initializer of bias, defaults to xavier uniform initializer.
-        position_embed_initializer (:class:`typing.Callable`, optional):
-            The initializer of position embedding, defaults to zeros initializer.
-
-    More details about ``initializer`` please refer to
-    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
-    """
-
-    def __init__(
-        self,
-        img_size: int,
-        patch_size: int,
-        in_chans: int,
-        embed_size: int,
-        dtype: dtype = None,
-        flatten: bool = True,
-        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
-        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-        position_embed_initializer: Callable = init.zeros_()
-    ) -> None:
-        tensor_parallel = get_tensor_parallel_mode()
-        embed = _parallel_patchembedding[tensor_parallel](
-            img_size,
-            patch_size,
-            in_chans,
-            embed_size,
-            dtype=dtype,
-            flatten=flatten,
-            weight_initializer=weight_initializer,
-            bias_initializer=bias_initializer,
-            position_embed_initializer=position_embed_initializer,
-        )
-        super().__init__(embed)
+import math
+from typing import Callable
+
+from torch import dtype, nn
+
+from colossalai.nn import init
+from colossalai.utils import get_current_device
+
+from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
+from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
+from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
+from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
+from ..utils import get_tensor_parallel_mode
+from ..vanilla import VanillaPatchEmbedding
+from ._utils import ColossalaiModule
+
+_parallel_embedding = {
+    '1d': Embedding1D,
+    '2d': Embedding2D,
+    '2.5d': Embedding2p5D,
+    '3d': Embedding3D,
+}
+
+_vocab_parallel_embedding = {
+    '1d': VocabParallelEmbedding1D,
+    '2d': VocabParallelEmbedding2D,
+    '2.5d': VocabParallelEmbedding2p5D,
+    '3d': VocabParallelEmbedding3D
+}
+
+_parallel_patchembedding = {
+    None: VanillaPatchEmbedding,
+    '1d': PatchEmbedding1D,
+    '2d': PatchEmbedding2D,
+    '2.5d': PatchEmbedding2p5D,
+    '3d': PatchEmbedding3D
+}
+
+
+class Embedding(ColossalaiModule):
+    r"""Embedding for colossalai.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+                    renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+                    of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    For more details about ``initializer``, please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
+    """
+
+    def __init__(self,
+                 num_embeddings: int,
+                 embedding_dim: int,
+                 padding_idx: int = None,
+                 dtype: dtype = None,
+                 weight_initializer: Callable = init.normal_(),
+                 vocab_parallel_limit: int = 2048,
+                 *args,
+                 **kwargs) -> None:
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel is None:
+            embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
+                                 **kwargs).to(dtype).to(get_current_device())
+            weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
+        elif num_embeddings <= vocab_parallel_limit:
+            embed = _parallel_embedding[tensor_parallel](
+                num_embeddings,
+                embedding_dim,
+                padding_idx=padding_idx,
+                dtype=dtype,
+                weight_initializer=weight_initializer,
+                *args,
+                **kwargs,
+            )
+        else:
+            embed = _vocab_parallel_embedding[tensor_parallel](
+                num_embeddings,
+                embedding_dim,
+                padding_idx=padding_idx,
+                dtype=dtype,
+                weight_initializer=weight_initializer,
+                *args,
+                **kwargs,
+            )
+        super().__init__(embed)
+
+
+class PatchEmbedding(ColossalaiModule):
+    """2D Image to Patch Embedding.
+
+    Args:
+        img_size (int): image size.
+        patch_size (int): patch size.
+        in_chans (int): number of channels of input image.
+        embed_size (int): size of embedding.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        flatten (bool, optional): whether to flatten output tensor, defaults to True.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+        position_embed_initializer (:class:`typing.Callable`, optional):
+            The initializer of position embedding, defaults to zeros initializer.
+
+    For more details about ``initializer``, please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(
+        self,
+        img_size: int,
+        patch_size: int,
+        in_chans: int,
+        embed_size: int,
+        dtype: dtype = None,
+        flatten: bool = True,
+        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        position_embed_initializer: Callable = init.zeros_()
+    ) -> None:
+        tensor_parallel = get_tensor_parallel_mode()
+        embed = _parallel_patchembedding[tensor_parallel](
+            img_size,
+            patch_size,
+            in_chans,
+            embed_size,
+            dtype=dtype,
+            flatten=flatten,
+            weight_initializer=weight_initializer,
+            bias_initializer=bias_initializer,
+            position_embed_initializer=position_embed_initializer,
+        )
+        super().__init__(embed)
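# Hedged usage sketch (not part of the patch): how the relocated Embedding wrapper above
# chooses its backend. With no tensor-parallel mode it builds a plain torch.nn.Embedding;
# with a mode set ('1d', '2d', '2.5d', '3d') it picks the matching parallel class, and it
# switches to the vocab-parallel variant once num_embeddings exceeds vocab_parallel_limit.
# Assumes colossalai.launch has already set up the parallel context for the parallel paths.
from colossalai.legacy.nn import Embedding

# Small vocabulary: dispatches via _parallel_embedding[mode] (or nn.Embedding if mode is None).
tok_embed = Embedding(num_embeddings=1000, embedding_dim=512)

# Large vocabulary: exceeds the default vocab_parallel_limit of 2048, so the
# _vocab_parallel_embedding[mode] variant is selected instead.
vocab_embed = Embedding(num_embeddings=50000, embedding_dim=512, vocab_parallel_limit=2048)
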
diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/legacy/nn/layer/colossalai_layer/linear.py
similarity index 99%
rename from colossalai/nn/layer/colossalai_layer/linear.py
rename to colossalai/legacy/nn/layer/colossalai_layer/linear.py
index 3e0c6e285..c05ceb66c 100644
--- a/colossalai/nn/layer/colossalai_layer/linear.py
+++ b/colossalai/legacy/nn/layer/colossalai_layer/linear.py
@@ -4,9 +4,9 @@ from typing import Callable
 
 from torch import dtype, nn
 
+from colossalai.nn import init
 from colossalai.utils import get_current_device
 
-from ... import init as init
 from ..parallel_1d import *
 from ..parallel_2d import *
 from ..parallel_2p5d import *
diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py
similarity index 97%
rename from colossalai/nn/layer/colossalai_layer/normalization.py
rename to colossalai/legacy/nn/layer/colossalai_layer/normalization.py
index 86861d302..f8e317e72 100644
--- a/colossalai/nn/layer/colossalai_layer/normalization.py
+++ b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py
@@ -1,41 +1,42 @@
-from colossalai.utils import get_current_device
-from torch import nn
-
-from ..parallel_1d import LayerNorm1D
-from ..parallel_2d import LayerNorm2D
-from ..parallel_2p5d import LayerNorm2p5D
-from ..parallel_3d import LayerNorm3D
-from ..utils import get_tensor_parallel_mode
-from ..vanilla import VanillaLayerNorm
-from ._utils import ColossalaiModule
-
-_parallel_layernorm = {
-    None: VanillaLayerNorm,
-    "1d": LayerNorm1D,
-    "2d": LayerNorm2D,
-    "2.5d": LayerNorm2p5D,
-    "3d": LayerNorm3D,
-}
-
-
-class LayerNorm(ColossalaiModule):
-    r"""Layer Normalization for colossalai.
-
-    Args:
-        normalized_shape (int): input shape from an expected input of size.
-            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
-            \times \ldots \times \text{normalized_shape}[-1]]`
-            If a single integer is used, it is treated as a singleton list, and this module will
-            normalize over the last dimension which is expected to be of that specific size.
-        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
-        bias (bool, optional): Whether to add a bias, defaults to ``True``.
-        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
-    """
-
-    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel is None:
-            norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
-        else:
-            norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
-        super().__init__(norm)
+from torch import nn
+
+from colossalai.utils import get_current_device
+
+from ..parallel_1d import LayerNorm1D
+from ..parallel_2d import LayerNorm2D
+from ..parallel_2p5d import LayerNorm2p5D
+from ..parallel_3d import LayerNorm3D
+from ..utils import get_tensor_parallel_mode
+from ..vanilla import VanillaLayerNorm
+from ._utils import ColossalaiModule
+
+_parallel_layernorm = {
+    None: VanillaLayerNorm,
+    "1d": LayerNorm1D,
+    "2d": LayerNorm2D,
+    "2.5d": LayerNorm2p5D,
+    "3d": LayerNorm3D,
+}
+
+
+class LayerNorm(ColossalaiModule):
+    r"""Layer Normalization for colossalai.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
+            \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
+        bias (bool, optional): Whether to add a bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
+
+    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel is None:
+            norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
+        else:
+            norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
+        super().__init__(norm)
diff --git a/colossalai/legacy/nn/layer/parallel_1d/__init__.py b/colossalai/legacy/nn/layer/parallel_1d/__init__.py
new file mode 100644
index 000000000..9cffd4d33
--- /dev/null
+++ b/colossalai/legacy/nn/layer/parallel_1d/__init__.py
@@ -0,0 +1,17 @@
+from .layers import (
+    Classifier1D,
+    Dropout1D,
+    Embedding1D,
+    LayerNorm1D,
+    Linear1D,
+    Linear1D_Col,
+    Linear1D_Row,
+    PatchEmbedding1D,
+    VocabParallelClassifier1D,
+    VocabParallelEmbedding1D,
+)
+
+__all__ = [
+    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
+    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
+]
diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py
similarity index 100%
rename from colossalai/nn/layer/parallel_1d/_operation.py
rename to colossalai/legacy/nn/layer/parallel_1d/_operation.py
diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/legacy/nn/layer/parallel_1d/_utils.py
similarity index 99%
rename from colossalai/nn/layer/parallel_1d/_utils.py
rename to colossalai/legacy/nn/layer/parallel_1d/_utils.py
index 1212d5956..fddf4e73d 100644
--- a/colossalai/nn/layer/parallel_1d/_utils.py
+++ b/colossalai/legacy/nn/layer/parallel_1d/_utils.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.distributed as dist
+
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
 
@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
 class _SplitForwardGatherBackward(torch.autograd.Function):
     """
     Split the input and keep only the corresponding chuck to the rank.
-    
+
     Args:
         input_: input matrix.
         parallel_mode: parallel mode.
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/legacy/nn/layer/parallel_1d/layers.py
similarity index 99%
rename from colossalai/nn/layer/parallel_1d/layers.py
rename to colossalai/legacy/nn/layer/parallel_1d/layers.py
index 7b129009e..c0a169c15 100644
--- a/colossalai/nn/layer/parallel_1d/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_1d/layers.py
@@ -10,11 +10,11 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn.parameter import Parameter
 
-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
diff --git a/colossalai/nn/layer/parallel_2d/__init__.py b/colossalai/legacy/nn/layer/parallel_2d/__init__.py
similarity index 59%
rename from colossalai/nn/layer/parallel_2d/__init__.py
rename to colossalai/legacy/nn/layer/parallel_2d/__init__.py
index 5562d1a70..9c65f3608 100644
--- a/colossalai/nn/layer/parallel_2d/__init__.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/__init__.py
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)
 
 __all__ = [
     'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
similarity index 98%
rename from colossalai/nn/layer/parallel_2d/_operation.py
rename to colossalai/legacy/nn/layer/parallel_2d/_operation.py
index 306577dbd..fa9b49bcf 100644
--- a/colossalai/nn/layer/parallel_2d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py
@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
 
 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device
 
 
 def matmul_2d(
@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)
 
         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
 
         opa = [None] * 2
         opb = [None] * 2
@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)
 
         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
         src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
 
         opb = [None] * 2
         opr = [None] * 2
@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)
 
         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
         src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
-                pipeline_parallel_rank * tensor_parallel_size
+            pipeline_parallel_rank * tensor_parallel_size
 
         opa = [None] * 2
         opr = [None] * 2
diff --git a/colossalai/nn/layer/parallel_2d/_utils.py b/colossalai/legacy/nn/layer/parallel_2d/_utils.py
similarity index 100%
rename from colossalai/nn/layer/parallel_2d/_utils.py
rename to colossalai/legacy/nn/layer/parallel_2d/_utils.py
diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/legacy/nn/layer/parallel_2d/layers.py
similarity index 99%
rename from colossalai/nn/layer/parallel_2d/layers.py
rename to colossalai/legacy/nn/layer/parallel_2d/layers.py
index 1a01d5437..b458d15c7 100644
--- a/colossalai/nn/layer/parallel_2d/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_2d/layers.py
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
 
-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
diff --git a/colossalai/nn/layer/parallel_2p5d/__init__.py b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py
similarity index 59%
rename from colossalai/nn/layer/parallel_2p5d/__init__.py
rename to colossalai/legacy/nn/layer/parallel_2p5d/__init__.py
index bec3b1c4b..23e47e6ed 100644
--- a/colossalai/nn/layer/parallel_2p5d/__init__.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
-from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
-                     VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
+from .layers import (
+    Classifier2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VocabParallelClassifier2p5D,
+    VocabParallelEmbedding2p5D,
+)
 
 __all__ = [
     'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
similarity index 99%
rename from colossalai/nn/layer/parallel_2p5d/_operation.py
rename to colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
index 5a0f537cd..55defa4a3 100644
--- a/colossalai/nn/layer/parallel_2p5d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py
@@ -2,13 +2,14 @@ from typing import Any, Tuple
 
 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
 
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
+from colossalai.utils import get_current_device
+
 
 def get_parallel_group(parallel_mode: ParallelMode):
     return gpc.get_group(parallel_mode)
diff --git a/colossalai/nn/layer/parallel_2p5d/_utils.py b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py
similarity index 100%
rename from colossalai/nn/layer/parallel_2p5d/_utils.py
rename to colossalai/legacy/nn/layer/parallel_2p5d/_utils.py
diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py
similarity index 99%
rename from colossalai/nn/layer/parallel_2p5d/layers.py
rename to colossalai/legacy/nn/layer/parallel_2p5d/layers.py
index 62c4292fd..04acc2bb0 100644
--- a/colossalai/nn/layer/parallel_2p5d/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
 
-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
diff --git a/colossalai/nn/layer/parallel_3d/__init__.py b/colossalai/legacy/nn/layer/parallel_3d/__init__.py
similarity index 62%
rename from colossalai/nn/layer/parallel_3d/__init__.py
rename to colossalai/legacy/nn/layer/parallel_3d/__init__.py
index 9ae255b44..17fe8403c 100644
--- a/colossalai/nn/layer/parallel_3d/__init__.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/__init__.py
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
-from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
-                     VocabParallelEmbedding3D)
+from .layers import (
+    Classifier3D,
+    Embedding3D,
+    LayerNorm3D,
+    Linear3D,
+    PatchEmbedding3D,
+    VocabParallelClassifier3D,
+    VocabParallelEmbedding3D,
+)
 
 __all__ = [
     'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
similarity index 99%
rename from colossalai/nn/layer/parallel_3d/_operation.py
rename to colossalai/legacy/nn/layer/parallel_3d/_operation.py
index 5dc9a2428..ca0b0e627 100755
--- a/colossalai/nn/layer/parallel_3d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py
@@ -7,10 +7,10 @@ import torch
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd
 
-from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 
 from ._utils import get_parallel_mode_from_env, push_async_grad
 
diff --git a/colossalai/nn/layer/parallel_3d/_utils.py b/colossalai/legacy/nn/layer/parallel_3d/_utils.py
similarity index 100%
rename from colossalai/nn/layer/parallel_3d/_utils.py
rename to colossalai/legacy/nn/layer/parallel_3d/_utils.py
diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/legacy/nn/layer/parallel_3d/layers.py
similarity index 99%
rename from colossalai/nn/layer/parallel_3d/layers.py
rename to colossalai/legacy/nn/layer/parallel_3d/layers.py
index 7d940aa27..b815a842c 100644
--- a/colossalai/nn/layer/parallel_3d/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_3d/layers.py
@@ -8,14 +8,14 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter
 
-from colossalai.communication import all_reduce, broadcast
 from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import all_reduce, broadcast
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
-from colossalai.nn.layer.base_layer import ParallelLayer
 from colossalai.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
diff --git a/colossalai/nn/layer/parallel_sequence/__init__.py b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py
similarity index 74%
rename from colossalai/nn/layer/parallel_sequence/__init__.py
rename to colossalai/legacy/nn/layer/parallel_sequence/__init__.py
index 4fa9eed6f..d92d66d40 100644
--- a/colossalai/nn/layer/parallel_sequence/__init__.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py
@@ -1,4 +1,4 @@
-from ._operation import RingQK, RingAV
+from ._operation import RingAV, RingQK
 from .layers import TransformerSelfAttentionRing
 
 __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
diff --git a/colossalai/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
similarity index 97%
rename from colossalai/nn/layer/parallel_sequence/_operation.py
rename to colossalai/legacy/nn/layer/parallel_sequence/_operation.py
index fc8049422..fcf296201 100644
--- a/colossalai/nn/layer/parallel_sequence/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py
@@ -3,13 +3,13 @@
 
 import torch
 from torch import distributed as dist
+from torch.cuda.amp import custom_bwd, custom_fwd
 
-from colossalai.communication import ring_forward
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
+from colossalai.legacy.communication import ring_forward
+from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
 from colossalai.utils import get_current_device
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 
 class RingQK(torch.autograd.Function):
diff --git a/colossalai/nn/layer/parallel_sequence/_utils.py b/colossalai/legacy/nn/layer/parallel_sequence/_utils.py
similarity index 100%
rename from colossalai/nn/layer/parallel_sequence/_utils.py
rename to colossalai/legacy/nn/layer/parallel_sequence/_utils.py
diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py
similarity index 99%
rename from colossalai/nn/layer/parallel_sequence/layers.py
rename to colossalai/legacy/nn/layer/parallel_sequence/layers.py
index 4d0ff2e06..e44e61c2f 100644
--- a/colossalai/nn/layer/parallel_sequence/layers.py
+++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py
@@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.kernel import FusedScaleMaskSoftmax
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK
 
 
 @LAYERS.register_module
diff --git a/colossalai/legacy/nn/layer/utils/__init__.py b/colossalai/legacy/nn/layer/utils/__init__.py
new file mode 100644
index 000000000..56e969bfd
--- /dev/null
+++ b/colossalai/legacy/nn/layer/utils/__init__.py
@@ -0,0 +1,15 @@
+from .common import (
+    ACT2FN,
+    CheckpointModule,
+    _ntuple,
+    divide,
+    get_tensor_parallel_mode,
+    set_tensor_parallel_attribute_by_partition,
+    set_tensor_parallel_attribute_by_size,
+    to_2tuple,
+)
+
+__all__ = [
+    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
+    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
+]
diff --git a/colossalai/nn/layer/utils/common.py b/colossalai/legacy/nn/layer/utils/common.py
similarity index 99%
rename from colossalai/nn/layer/utils/common.py
rename to colossalai/legacy/nn/layer/utils/common.py
index f2297304f..d8f3ad2a7 100644
--- a/colossalai/nn/layer/utils/common.py
+++ b/colossalai/legacy/nn/layer/utils/common.py
@@ -6,10 +6,11 @@ from itertools import repeat
 
 import numpy as np
 import torch
+from torch import Tensor, nn
+
 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.utils import checkpoint
-from torch import Tensor, nn
 
 
 class CheckpointModule(nn.Module):
diff --git a/colossalai/nn/layer/vanilla/__init__.py b/colossalai/legacy/nn/layer/vanilla/__init__.py
similarity index 100%
rename from colossalai/nn/layer/vanilla/__init__.py
rename to colossalai/legacy/nn/layer/vanilla/__init__.py
diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/legacy/nn/layer/vanilla/layers.py
similarity index 100%
rename from colossalai/nn/layer/vanilla/layers.py
rename to colossalai/legacy/nn/layer/vanilla/layers.py
diff --git a/colossalai/nn/layer/wrapper/__init__.py b/colossalai/legacy/nn/layer/wrapper/__init__.py
similarity index 100%
rename from colossalai/nn/layer/wrapper/__init__.py
rename to colossalai/legacy/nn/layer/wrapper/__init__.py
diff --git a/colossalai/nn/layer/wrapper/pipeline_wrapper.py b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py
similarity index 99%
rename from colossalai/nn/layer/wrapper/pipeline_wrapper.py
rename to colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py
index ef1d794cc..68fea8622 100644
--- a/colossalai/nn/layer/wrapper/pipeline_wrapper.py
+++ b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py
@@ -1,6 +1,8 @@
-import torch.nn as nn
-import torch.distributed as dist
 from typing import List, Tuple, Union
+
+import torch.distributed as dist
+import torch.nn as nn
+
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 
diff --git a/colossalai/legacy/nn/loss/__init__.py b/colossalai/legacy/nn/loss/__init__.py
new file mode 100644
index 000000000..1bd8872d9
--- /dev/null
+++ b/colossalai/legacy/nn/loss/__init__.py
@@ -0,0 +1,41 @@
+from torch import nn
+from torch.nn.modules.loss import *
+from torch.nn.modules.loss import _Loss
+
+from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
+from .loss_1d import VocabParallelCrossEntropyLoss1D
+from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
+from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
+from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
+
+_parallel_cross_entropy = {
+    '2d': CrossEntropyLoss2D,
+    '2.5d': CrossEntropyLoss2p5D,
+    '3d': CrossEntropyLoss3D,
+}
+
+_vocab_parallel_cross_entropy = {
+    '1d': VocabParallelCrossEntropyLoss1D,
+    '2d': VocabParallelCrossEntropyLoss2D,
+    '2.5d': VocabParallelCrossEntropyLoss2p5D,
+    '3d': VocabParallelCrossEntropyLoss3D,
+}
+
+
+class CrossEntropyLoss(_Loss):
+
+    def __init__(self, reduction: bool = True, *args, **kwargs):
+        super().__init__()
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel is not None and env.vocab_parallel:
+            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+        elif tensor_parallel is None or tensor_parallel == '1d':
+            reduction = 'mean' if reduction else 'none'
+            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
+        else:
+            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+
+    def forward(self, *args):
+        return self.loss(*args)
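# Hedged usage sketch (not part of the patch): the CrossEntropyLoss wrapper above routes by
# parallel mode. With no tensor parallelism or '1d' it wraps torch.nn.CrossEntropyLoss and
# the boolean `reduction` maps to 'mean'/'none'; with vocab-parallel embeddings enabled it
# uses the VocabParallel* losses; otherwise it uses the 2d/2.5d/3d parallel loss. A parallel
# mode requires a launched Colossal-AI context.
import torch

from colossalai.legacy.nn import CrossEntropyLoss

criterion = CrossEntropyLoss(reduction=True)    # reduction=True -> mean over the batch
logits = torch.randn(8, 1000)
targets = torch.randint(0, 1000, (8,))
loss = criterion(logits, targets)
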
diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py
similarity index 100%
rename from colossalai/nn/loss/loss_1d.py
rename to colossalai/legacy/nn/loss/loss_1d.py
diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py
similarity index 97%
rename from colossalai/nn/loss/loss_2d.py
rename to colossalai/legacy/nn/loss/loss_2d.py
index 6db40c0f3..6191602b7 100644
--- a/colossalai/nn/loss/loss_2d.py
+++ b/colossalai/legacy/nn/loss/loss_2d.py
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
-from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.utils import get_current_device
 
 
diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py
similarity index 96%
rename from colossalai/nn/loss/loss_2p5d.py
rename to colossalai/legacy/nn/loss/loss_2p5d.py
index 9c78a1ef0..2746b2011 100644
--- a/colossalai/nn/loss/loss_2p5d.py
+++ b/colossalai/legacy/nn/loss/loss_2p5d.py
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
-from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.utils import get_current_device
 
 
diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py
similarity index 97%
rename from colossalai/nn/loss/loss_3d.py
rename to colossalai/legacy/nn/loss/loss_3d.py
index 5c0f26640..2aeb1bd98 100644
--- a/colossalai/nn/loss/loss_3d.py
+++ b/colossalai/legacy/nn/loss/loss_3d.py
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
 
 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.utils import get_current_device
 
 
diff --git a/colossalai/nn/metric/__init__.py b/colossalai/legacy/nn/metric/__init__.py
similarity index 87%
rename from colossalai/nn/metric/__init__.py
rename to colossalai/legacy/nn/metric/__init__.py
index 00833b611..76c6dac89 100644
--- a/colossalai/nn/metric/__init__.py
+++ b/colossalai/legacy/nn/metric/__init__.py
@@ -1,26 +1,28 @@
-from torch import nn
-
-from ._utils import calc_acc
-from .accuracy_2d import Accuracy2D
-from .accuracy_2p5d import Accuracy2p5D
-from .accuracy_3d import Accuracy3D
-from colossalai.nn.layer.utils import get_tensor_parallel_mode
-
-_parallel_accuracy = {
-    '2d': Accuracy2D,
-    '2.5d': Accuracy2p5D,
-    '3d': Accuracy3D,
-}
-
-
-class Accuracy(nn.Module):
-    def __init__(self):
-        super().__init__()
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel not in _parallel_accuracy:
-            self.acc = calc_acc
-        else:
-            self.acc = _parallel_accuracy[tensor_parallel]()
-
-    def forward(self, *args):
-        return self.acc(*args)
+from torch import nn
+
+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
+from ._utils import calc_acc
+from .accuracy_2d import Accuracy2D
+from .accuracy_2p5d import Accuracy2p5D
+from .accuracy_3d import Accuracy3D
+
+_parallel_accuracy = {
+    '2d': Accuracy2D,
+    '2.5d': Accuracy2p5D,
+    '3d': Accuracy3D,
+}
+
+
+class Accuracy(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel not in _parallel_accuracy:
+            self.acc = calc_acc
+        else:
+            self.acc = _parallel_accuracy[tensor_parallel]()
+
+    def forward(self, *args):
+        return self.acc(*args)
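As with the loss wrapper, a brief usage sketch of the moved metric; the import path matches the docs updates later in this patch, and it assumes no tensor-parallel environment is initialized, so `Accuracy` reduces to `calc_acc`.

```python
# Sketch of the relocated Accuracy metric; assumes no tensor parallelism is configured.
import torch

from colossalai.legacy.nn.metric import Accuracy

metric = Accuracy()
logits = torch.randn(8, 5)
targets = torch.randint(0, 5, (8,))
correct = metric(logits, targets)    # number of correct predictions in the batch
```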
diff --git a/colossalai/nn/metric/_utils.py b/colossalai/legacy/nn/metric/_utils.py
similarity index 95%
rename from colossalai/nn/metric/_utils.py
rename to colossalai/legacy/nn/metric/_utils.py
index eac591b64..8706ffc10 100644
--- a/colossalai/nn/metric/_utils.py
+++ b/colossalai/legacy/nn/metric/_utils.py
@@ -1,7 +1,7 @@
-import torch
-
-
-def calc_acc(logits, targets):
-    preds = torch.argmax(logits, dim=-1)
-    correct = torch.sum(targets == preds)
-    return correct
+import torch
+
+
+def calc_acc(logits, targets):
+    preds = torch.argmax(logits, dim=-1)
+    correct = torch.sum(targets == preds)
+    return correct
diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/legacy/nn/metric/accuracy_2d.py
similarity index 89%
rename from colossalai/nn/metric/accuracy_2d.py
rename to colossalai/legacy/nn/metric/accuracy_2d.py
index a86832973..838c48834 100644
--- a/colossalai/nn/metric/accuracy_2d.py
+++ b/colossalai/legacy/nn/metric/accuracy_2d.py
@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from torch import nn
 
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+
 from ._utils import calc_acc
 
 
diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/legacy/nn/metric/accuracy_2p5d.py
similarity index 88%
rename from colossalai/nn/metric/accuracy_2p5d.py
rename to colossalai/legacy/nn/metric/accuracy_2p5d.py
index 3044da065..183380cd9 100644
--- a/colossalai/nn/metric/accuracy_2p5d.py
+++ b/colossalai/legacy/nn/metric/accuracy_2p5d.py
@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from torch import nn
 
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+
 from ._utils import calc_acc
 
 
diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/legacy/nn/metric/accuracy_3d.py
similarity index 85%
rename from colossalai/nn/metric/accuracy_3d.py
rename to colossalai/legacy/nn/metric/accuracy_3d.py
index 5506fc1d2..1aaac73ec 100644
--- a/colossalai/nn/metric/accuracy_3d.py
+++ b/colossalai/legacy/nn/metric/accuracy_3d.py
@@ -1,33 +1,35 @@
-import torch
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
-from torch import nn
-
-from ._utils import calc_acc
-
-
-class Accuracy3D(nn.Module):
-    """Accuracy for 3D parallelism
-    """
-    def __init__(self):
-        super().__init__()
-        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
-        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
-
-    def forward(self, logits, targets):
-        """Calculate the accuracy of predicted labels.
-
-        Args:
-            logits (:class:`torch.tensor`): Predicted labels.
-            targets (:class:`torch.tensor`): True labels from data.
-
-        Returns:
-            float: the accuracy of prediction.
-         """
-        with torch.no_grad():
-            targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
-            targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
-            correct = calc_acc(logits, targets)
-            correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
-        return correct
+import torch
+from torch import nn
+
+from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+
+from ._utils import calc_acc
+
+
+class Accuracy3D(nn.Module):
+    """Accuracy for 3D parallelism
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
+        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
+
+    def forward(self, logits, targets):
+        """Calculate the accuracy of predicted labels.
+
+        Args:
+            logits (:class:`torch.tensor`): Predicted labels.
+            targets (:class:`torch.tensor`): True labels from data.
+
+        Returns:
+            float: the accuracy of prediction.
+         """
+        with torch.no_grad():
+            targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
+            targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
+            correct = calc_acc(logits, targets)
+            correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
+        return correct
diff --git a/colossalai/nn/parallel/__init__.py b/colossalai/legacy/nn/parallel/__init__.py
similarity index 100%
rename from colossalai/nn/parallel/__init__.py
rename to colossalai/legacy/nn/parallel/__init__.py
diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/legacy/nn/parallel/data_parallel.py
similarity index 100%
rename from colossalai/nn/parallel/data_parallel.py
rename to colossalai/legacy/nn/parallel/data_parallel.py
diff --git a/colossalai/nn/parallel/layers/__init__.py b/colossalai/legacy/nn/parallel/layers/__init__.py
similarity index 56%
rename from colossalai/nn/parallel/layers/__init__.py
rename to colossalai/legacy/nn/parallel/layers/__init__.py
index 29b8353e6..f38124efe 100644
--- a/colossalai/nn/parallel/layers/__init__.py
+++ b/colossalai/legacy/nn/parallel/layers/__init__.py
@@ -1,10 +1,17 @@
+from .cache_embedding import (
+    CachedEmbeddingBag,
+    CachedParamMgr,
+    EvictionStrategy,
+    LimitBuffIndexCopyer,
+    ParallelCachedEmbeddingBag,
+    ParallelCachedEmbeddingBagTablewise,
+    ParallelCachedEmbeddingBagTablewiseSpiltCache,
+    TablewiseEmbeddingBagConfig,
+)
 from .colo_module import ColoModule
-from .linear import ColoLinear
 from .embedding import ColoEmbedding
-from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
-
-from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
-    ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
+from .linear import ColoLinear
+from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module
 
 __all__ = [
     'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
diff --git a/colossalai/nn/parallel/layers/cache_embedding/__init__.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py
similarity index 100%
rename from colossalai/nn/parallel/layers/cache_embedding/__init__.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py
index 5bbc931a7..d87930c1c 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/__init__.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py
@@ -1,8 +1,8 @@
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from .copyer import LimitBuffIndexCopyer
 from .cached_embedding import CachedEmbeddingBag
-from .parallel_cached_embedding import ParallelCachedEmbeddingBag
+from .copyer import LimitBuffIndexCopyer
 from .embedding_config import TablewiseEmbeddingBagConfig
+from .parallel_cached_embedding import ParallelCachedEmbeddingBag
 from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
 from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
 
diff --git a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py
similarity index 99%
rename from colossalai/nn/parallel/layers/cache_embedding/base_embedding.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py
index 705835a0e..9558c541e 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py
@@ -1,4 +1,5 @@
 import abc
+
 import torch.nn as nn
 
 
diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py
similarity index 99%
rename from colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py
index a6159856d..16530c4ce 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py
@@ -1,12 +1,14 @@
-import numpy as np
-import torch
-from torch.profiler import record_function
-from typing import List, Optional
-from contexttimer import Timer
-from .copyer import LimitBuffIndexCopyer
-from enum import Enum
 import sys
 from contextlib import contextmanager
+from enum import Enum
+from typing import List, Optional
+
+import numpy as np
+import torch
+from contexttimer import Timer
+from torch.profiler import record_function
+
+from .copyer import LimitBuffIndexCopyer
 
 
 class EvictionStrategy(Enum):
@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
 class CachedParamMgr(torch.nn.Module):
     """
     Manage Embedding Weights on CPU and CUDA memory uses a software cache.
-    CPU maintains the entire original weight. 
+    CPU maintains the entire original weight.
     CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
     During training, GPU needs to transmit embedding rows between CPU and GPU.
     Args:
@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
         self._elapsed_dict[name] += t.elapsed
 
     def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
-        """_find_evict_gpu_idxs 
+        """_find_evict_gpu_idxs
         Find the gpu idxs to be evicted, according to their freq.
         Args:
             evict_num (int): how many rows has to be evicted
@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
         """reorder
         reorder the weight according to ids' frequency in dataset before training.
         Execute only once before training, also known as warmup phase.
-        
+
         Note:
             If you would like to use the DATASET as the eviction strategy, you must call this function.
         Note:
@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
         """
         deprecated
         evict one row from cuda to cpu.
-        Returns: 
+        Returns:
         (int) : the slot id be evicted.
         """
         mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py
similarity index 98%
rename from colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py
index a74cb8d94..bc7d17890 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py
@@ -1,10 +1,11 @@
+from typing import Iterator, List, Optional, Tuple, Union
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple, Union
+from torch.nn.parameter import Parameter
 
 from .base_embedding import BaseEmbeddingBag
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from torch.nn.parameter import Parameter
 
 
 class CachedEmbeddingBag(BaseEmbeddingBag):
@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
-        cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row 
+        cache_ratio (float, optional): cache ratio of the #cuda_weight_row / #cpu_weight_row

         ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
                     buffer_size=50_000,
                     pin_weight=False):
         """
-        Called after initialized. 
+        Called after initialization.
         Reorder the weight rows according to the ids_freq_mapping.
         Then, let the weights of the Module be managed by a CachedParamMgr.
-        
+
         Args:
             cuda_row_num (int): number of rows can be hosted in CUDA memory
             ids_freq_mapping (List[int]): a list, idx is id number, value is freq
diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py
similarity index 97%
rename from colossalai/nn/parallel/layers/cache_embedding/copyer.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py
index aa1f79448..804a07f88 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py
@@ -3,7 +3,7 @@ from torch import LongTensor
 
 
 class LimitBuffIndexCopyer(object):
-    """LimitBuffIndexCopyer 
+    """LimitBuffIndexCopyer
     Index Copy using limited temp buffer on CUDA.
 
     Args:
@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
 
     @torch.no_grad()
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
-        """copy 
+        """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
         The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
 
diff --git a/colossalai/nn/parallel/layers/cache_embedding/embedding_config.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py
similarity index 100%
rename from colossalai/nn/parallel/layers/cache_embedding/embedding_config.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py
diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py
similarity index 96%
rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py
index d7f77e195..79d7672b2 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py
@@ -1,12 +1,13 @@
+from typing import Iterator, List, Optional, Tuple
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple
 
-from .cached_embedding import CachedEmbeddingBag
-from colossalai.nn._ops._utils import dual_all_to_all
+from colossalai.legacy.nn._ops._utils import dual_all_to_all
+from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
 
-from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
 from .cache_mgr import CachedParamMgr, EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
 
 
 def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py
similarity index 99%
rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py
index 949f85ad4..116d836b7 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py
@@ -1,15 +1,16 @@
+import time
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-from .cached_embedding import CachedEmbeddingBag
-from .cache_mgr import EvictionStrategy
-from .embedding_config import TablewiseEmbeddingBagConfig
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
 
-from typing import List
-import time
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
similarity index 99%
rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
index 80a54b4fa..0014c784f 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
+++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
@@ -1,17 +1,17 @@
+import abc
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.profiler import record_function
 
-from .cached_embedding import CachedEmbeddingBag
-
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from .embedding_config import TablewiseEmbeddingBagConfig
-from .cache_mgr import EvictionStrategy
 
-from typing import List
-import abc
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
diff --git a/colossalai/nn/parallel/layers/colo_module.py b/colossalai/legacy/nn/parallel/layers/colo_module.py
similarity index 98%
rename from colossalai/nn/parallel/layers/colo_module.py
rename to colossalai/legacy/nn/parallel/layers/colo_module.py
index 8f0f5d5f5..a0a3eb40c 100644
--- a/colossalai/nn/parallel/layers/colo_module.py
+++ b/colossalai/legacy/nn/parallel/layers/colo_module.py
@@ -1,6 +1,7 @@
-from colossalai.tensor.distspec import _DistSpec
+from typing import Dict, List
+
 from colossalai.tensor import ComputePattern
-from typing import List, Dict
+from colossalai.tensor.distspec import _DistSpec
 
 
 class ColoModule(object):
diff --git a/colossalai/nn/parallel/layers/embedding.py b/colossalai/legacy/nn/parallel/layers/embedding.py
similarity index 92%
rename from colossalai/nn/parallel/layers/embedding.py
rename to colossalai/legacy/nn/parallel/layers/embedding.py
index ccacc1ead..3e4e7ffd8 100644
--- a/colossalai/nn/parallel/layers/embedding.py
+++ b/colossalai/legacy/nn/parallel/layers/embedding.py
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoEmbedding(ColoModule):
diff --git a/colossalai/nn/parallel/layers/linear.py b/colossalai/legacy/nn/parallel/layers/linear.py
similarity index 93%
rename from colossalai/nn/parallel/layers/linear.py
rename to colossalai/legacy/nn/parallel/layers/linear.py
index 84a8c0425..e391cf808 100644
--- a/colossalai/nn/parallel/layers/linear.py
+++ b/colossalai/legacy/nn/parallel/layers/linear.py
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoLinear(ColoModule):
diff --git a/colossalai/nn/parallel/layers/module_utils.py b/colossalai/legacy/nn/parallel/layers/module_utils.py
similarity index 99%
rename from colossalai/nn/parallel/layers/module_utils.py
rename to colossalai/legacy/nn/parallel/layers/module_utils.py
index 38d128cc7..191266fa7 100644
--- a/colossalai/nn/parallel/layers/module_utils.py
+++ b/colossalai/legacy/nn/parallel/layers/module_utils.py
@@ -1,9 +1,11 @@
 from typing import Dict
-from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
-from colossalai.tensor import distspec
-from . import ColoModule
+
 import torch
 
+from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
+
+from . import ColoModule
+
 _COLOSSAL_MODULES: Dict[type, ColoModule] = {}
 
 
diff --git a/colossalai/nn/parallel/reducer.py b/colossalai/legacy/nn/parallel/reducer.py
similarity index 100%
rename from colossalai/nn/parallel/reducer.py
rename to colossalai/legacy/nn/parallel/reducer.py
diff --git a/colossalai/legacy/trainer/hooks/_metric_hook.py b/colossalai/legacy/trainer/hooks/_metric_hook.py
index d0598c240..f1bd19387 100644
--- a/colossalai/legacy/trainer/hooks/_metric_hook.py
+++ b/colossalai/legacy/trainer/hooks/_metric_hook.py
@@ -7,9 +7,9 @@ from typing import Callable
 import torch
 import torch.distributed as dist
 
-from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
 from colossalai.utils import get_current_device, is_no_pp_or_last_stage
 
diff --git a/colossalai/logging/logger.py b/colossalai/logging/logger.py
index af7b7de54..f9abe4a2a 100644
--- a/colossalai/logging/logger.py
+++ b/colossalai/logging/logger.py
@@ -6,8 +6,7 @@ import logging
 from pathlib import Path
 from typing import List, Union
 
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist
 
 
 class DistributedLogger:
@@ -63,6 +62,7 @@ class DistributedLogger:
             self._logger.propagate = False
 
             DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0
 
     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ class DistributedLogger:
         # create log directory
         path.mkdir(parents=True, exist_ok=True)
 
-        # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)
 
         # add file handler
@@ -128,19 +122,14 @@ class DistributedLogger:
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)
 
-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)
 
-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.
 
         Args:
@@ -150,10 +139,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)
 
-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.
 
         Args:
@@ -163,10 +152,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('warning', message_prefix, parallel_mode, ranks)
-        self._log('warning', message, parallel_mode, ranks)
+        self._log('warning', message_prefix, ranks)
+        self._log('warning', message, ranks)
 
-    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def debug(self, message: str, ranks: List[int] = None) -> None:
         """Log a debug message.
 
         Args:
@@ -176,10 +165,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('debug', message_prefix, parallel_mode, ranks)
-        self._log('debug', message, parallel_mode, ranks)
+        self._log('debug', message_prefix, ranks)
+        self._log('debug', message, ranks)
 
-    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def error(self, message: str, ranks: List[int] = None) -> None:
         """Log an error message.
 
         Args:
@@ -189,5 +178,5 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('error', message_prefix, parallel_mode, ranks)
-        self._log('error', message, parallel_mode, ranks)
+        self._log('error', message_prefix, ranks)
+        self._log('error', message, ranks)
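Since the logger now filters on the global `torch.distributed` rank instead of a `ParallelMode`, a short sketch of the updated call pattern; it assumes a process group has already been initialized (e.g. via `colossalai.launch` or `dist.init_process_group`), otherwise every process behaves as rank 0.

```python
# Sketch of the rank-based logging API after the refactor above.
from colossalai.logging import get_dist_logger

logger = get_dist_logger()
logger.info("printed on every rank")
logger.info("printed on rank 0 only", ranks=[0])    # the parallel_mode argument is gone
```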
diff --git a/colossalai/nn/__init__.py b/colossalai/nn/__init__.py
index 910ad2031..c6c4d3042 100644
--- a/colossalai/nn/__init__.py
+++ b/colossalai/nn/__init__.py
@@ -1,6 +1,5 @@
-from ._ops import *
+from .init import *
 from .layer import *
 from .loss import *
 from .lr_scheduler import *
-from .metric import *
 from .optimizer import *
diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py
index b705632f8..edd986ef5 100644
--- a/colossalai/nn/layer/__init__.py
+++ b/colossalai/nn/layer/__init__.py
@@ -1,10 +1,2 @@
-from .colossalai_layer import *
-from .parallel_1d import *
-from .parallel_2d import *
-from .parallel_2p5d import *
-from .parallel_3d import *
-from .parallel_sequence import *
 from .moe import *
 from .utils import *
-from .vanilla import *
-from .wrapper import *
diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py
deleted file mode 100644
index 2353851df..000000000
--- a/colossalai/nn/layer/parallel_1d/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
-                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
-
-__all__ = [
-    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
-    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
-]
diff --git a/colossalai/nn/layer/utils.py b/colossalai/nn/layer/utils.py
new file mode 100644
index 000000000..dc12ff8da
--- /dev/null
+++ b/colossalai/nn/layer/utils.py
@@ -0,0 +1,14 @@
+def divide(numerator, denominator):
+    """Only allow exact division.
+
+    Args:
+        numerator (int): Numerator of the division.
+        denominator (int): Denominator of the division.
+
+    Returns:
+        int: the result of exact division.
+    """
+    assert denominator != 0, 'denominator can not be zero'
+    assert numerator % denominator == 0, \
+        '{} is not divisible by {}'.format(numerator, denominator)
+    return numerator // denominator
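The new helper only permits exact division; a quick behaviour sketch:

```python
# Behaviour sketch for the divide() helper added above.
from colossalai.nn.layer.utils import divide

assert divide(12, 4) == 3
# divide(10, 4) raises AssertionError: '10 is not divisible by 4'
```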
diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py
deleted file mode 100644
index 7e999ee82..000000000
--- a/colossalai/nn/layer/utils/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
-                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)
-
-__all__ = [
-    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
-    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
-]
diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py
index 373e4ec94..ee2add48a 100644
--- a/colossalai/nn/loss/__init__.py
+++ b/colossalai/nn/loss/__init__.py
@@ -1,41 +1 @@
-from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import get_tensor_parallel_mode
-from torch import nn
-from torch.nn.modules.loss import *
-from torch.nn.modules.loss import _Loss
-
-from .loss_1d import VocabParallelCrossEntropyLoss1D
-from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
-from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
-from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
 from .loss_moe import MoeCrossEntropyLoss, MoeLoss
-
-_parallel_cross_entropy = {
-    '2d': CrossEntropyLoss2D,
-    '2.5d': CrossEntropyLoss2p5D,
-    '3d': CrossEntropyLoss3D,
-}
-
-_vocab_parallel_cross_entropy = {
-    '1d': VocabParallelCrossEntropyLoss1D,
-    '2d': VocabParallelCrossEntropyLoss2D,
-    '2.5d': VocabParallelCrossEntropyLoss2p5D,
-    '3d': VocabParallelCrossEntropyLoss3D,
-}
-
-
-class CrossEntropyLoss(_Loss):
-
-    def __init__(self, reduction: bool = True, *args, **kwargs):
-        super().__init__()
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel is not None and env.vocab_parallel:
-            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-        elif tensor_parallel is None or tensor_parallel == '1d':
-            reduction = 'mean' if reduction else 'none'
-            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
-        else:
-            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-
-    def forward(self, *args):
-        return self.loss(*args)
diff --git a/colossalai/nn/lr_scheduler/cosine.py b/colossalai/nn/lr_scheduler/cosine.py
index 0010435c2..fb587e1a1 100644
--- a/colossalai/nn/lr_scheduler/cosine.py
+++ b/colossalai/nn/lr_scheduler/cosine.py
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingLR(_CosineAnnealingLR):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
@@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
         super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingWarmupLR(WarmupScheduler):
     """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
 
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
         super().__init__(optimizer, warmup_steps, base_scheduler)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingLR(DelayerScheduler):
     """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
 
@@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
         super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
     """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
     applied, and then the learning rate will be a fixed value before starting decay.
diff --git a/colossalai/nn/lr_scheduler/linear.py b/colossalai/nn/lr_scheduler/linear.py
index 251779647..21a865e4c 100644
--- a/colossalai/nn/lr_scheduler/linear.py
+++ b/colossalai/nn/lr_scheduler/linear.py
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
 
-
-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
 
diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py
index 4f18b49fc..c428c911c 100644
--- a/colossalai/nn/lr_scheduler/multistep.py
+++ b/colossalai/nn/lr_scheduler/multistep.py
@@ -2,12 +2,9 @@ from typing import List
 
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
@@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
         super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepWarmupLR(WarmupScheduler):
     """Multistep learning rate scheduler with warmup.
 
diff --git a/colossalai/nn/lr_scheduler/onecycle.py b/colossalai/nn/lr_scheduler/onecycle.py
index 20e9aaec6..6835b3ee1 100644
--- a/colossalai/nn/lr_scheduler/onecycle.py
+++ b/colossalai/nn/lr_scheduler/onecycle.py
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
 
-
-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
diff --git a/colossalai/nn/lr_scheduler/poly.py b/colossalai/nn/lr_scheduler/poly.py
index a98506423..4f2249720 100644
--- a/colossalai/nn/lr_scheduler/poly.py
+++ b/colossalai/nn/lr_scheduler/poly.py
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
 
@@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                 for base_lr in self.base_lrs]
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
 
diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py
index 09f5d4585..8846e13c7 100644
--- a/colossalai/nn/lr_scheduler/torch.py
+++ b/colossalai/nn/lr_scheduler/torch.py
@@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
 
-
-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
@@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class MultiplicativeLR(_MultiplicativeLR):
     """Multiply the learning rate of each parameter group by the factor given
     in the specified function. When last_epoch=-1, sets initial lr as lr.
@@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ class StepLR(_StepLR):
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 210400a21..9767fcb8b 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -4,12 +4,10 @@ from typing import Optional
 import torch
 
 from colossalai.kernel.op_builder import CPUAdamBuilder
-from colossalai.legacy.registry import OPTIMIZERS
 
 from .nvme_optimizer import NVMeOptimizer
 
 
-@OPTIMIZERS.register_module
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py
index 0d13873cd..3a05a34f5 100644
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
@@ -8,11 +8,9 @@ Licensed under the MIT License.
 '''
 import torch
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py
index 48cc097c7..a2807d70f 100644
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
@@ -1,11 +1,9 @@
 # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
 import torch
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedLAMB(torch.optim.Optimizer):
     """Implements LAMB algorithm.
 
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py
index 0e8d3fc10..59a93a8be 100644
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
@@ -2,11 +2,9 @@
 import torch
 from torch.optim.optimizer import Optimizer, required
 
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 
-@OPTIMIZERS.register_module
 class FusedSGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).
 
diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py
index 7aa0ced18..e08df410e 100644
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -4,13 +4,11 @@ import torch
 from torch.optim import Adam
 
 from colossalai.kernel.op_builder import FusedOptimBuilder
-from colossalai.legacy.registry import OPTIMIZERS
 from colossalai.utils import multi_tensor_applier
 
 from .cpu_adam import CPUAdam
 
 
-@OPTIMIZERS.register_module
 class HybridAdam(CPUAdam):
     """Implements Adam algorithm.
 
diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py
index 769c11f62..d5de267f7 100644
--- a/colossalai/nn/optimizer/lamb.py
+++ b/colossalai/nn/optimizer/lamb.py
@@ -5,10 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-
 import torch
 from torch.optim import Optimizer
 
-from colossalai.legacy.registry import OPTIMIZERS
 
-
-@OPTIMIZERS.register_module
 class Lamb(Optimizer):
     r"""Implements Lamb algorithm.
     It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
diff --git a/colossalai/nn/optimizer/lars.py b/colossalai/nn/optimizer/lars.py
index 9dbb83b84..58393fdae 100644
--- a/colossalai/nn/optimizer/lars.py
+++ b/colossalai/nn/optimizer/lars.py
@@ -5,10 +5,7 @@ from typing import Iterable
 import torch
 from torch.optim import Optimizer
 
-from colossalai.legacy.registry import OPTIMIZERS
 
-
-@OPTIMIZERS.register_module
 class Lars(Optimizer):
     r"""Implements the LARS optimizer from `"Large batch training of convolutional networks"
     <https://arxiv.org/pdf/1708.03888.pdf>`_.
diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py
index 79913987b..ba8b1591d 100644
--- a/colossalai/pipeline/pipelinable.py
+++ b/colossalai/pipeline/pipelinable.py
@@ -1,15 +1,24 @@
-import torch
 import inspect
+
+import torch
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.utils import CheckpointModule
+from colossalai.tensor import ColoParameter
 from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
 
-from .utils import partition_uniform, partition_balanced, build_kwargs_for_function, \
-                build_kwargs_for_module, exec_func_with_kwargs, exec_funcs_with_kwargs, \
-                call_module, customized_partition
-from colossalai.nn.layer.utils import CheckpointModule
-from colossalai.tensor import ColoParameter
-from colossalai.core import global_context as gpc
-from colossalai.context import ParallelMode
 from .layer_spec import LayerSpec
+from .utils import (
+    build_kwargs_for_function,
+    build_kwargs_for_module,
+    call_module,
+    customized_partition,
+    exec_func_with_kwargs,
+    exec_funcs_with_kwargs,
+    partition_balanced,
+    partition_uniform,
+)
 
 
 class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py
index ac8a3ad7d..be8428692 100644
--- a/colossalai/pipeline/utils.py
+++ b/colossalai/pipeline/utils.py
@@ -1,12 +1,13 @@
 import heapq
 import inspect
-import torch
-
-from colossalai.logging import get_dist_logger
-from colossalai.nn.layer.utils import CheckpointModule
+from collections import OrderedDict
 from typing import List
 
-from collections import OrderedDict
+import torch
+
+from colossalai.legacy.nn.layer.utils import CheckpointModule
+from colossalai.logging import get_dist_logger
+
 
 def _binary_partition(weights: List, start: int, end: int):
     """Returns the binary partition position of `weights`, given the start
@@ -162,7 +163,7 @@ def build_kwargs_for_module(function, input_tensor, kw_dict):
         kwargs_offset = 1
     elif isinstance(input_tensor, (tuple, OrderedDict)):
         #assert isinstance(input_tensor, tuple), f'input_tensor should be a torch.Tensor or a tuple object.'
-        # Huggingface will take their own structures based on OrderedDict as the output 
+        # Huggingface will take their own structures based on OrderedDict as the output
         # between layers so we've to close this check.
         kwargs_offset = len(input_tensor)
     args_name_list = list(sig.parameters.keys())
@@ -256,7 +257,7 @@ def call_module(module, args=None, kwargs=None):
 
 def customized_partition(exec_seq):
     '''
-    This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an 
+    This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an
     annotation to note the partition point.
     '''
     customized_parts = {}
diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py
index c968050de..4740a316b 100644
--- a/colossalai/tensor/dist_spec_mgr.py
+++ b/colossalai/tensor/dist_spec_mgr.py
@@ -2,7 +2,6 @@ from contextlib import contextmanager
 
 import torch
 import torch.distributed as dist
-# from colossalai.nn.layer.utils import divide
 from numpy import prod
 
 from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec
diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py
index 7b2e8480c..6f9717d35 100644
--- a/colossalai/utils/__init__.py
+++ b/colossalai/utils/__init__.py
@@ -1,12 +1,14 @@
 from .activation_checkpoint import checkpoint
 from .checkpointing import load_checkpoint, save_checkpoint
 from .common import (
+    _cast_float,
     clip_grad_norm_fp32,
     conditional_context,
     copy_tensor_parallel_attributes,
     count_zeros_fp32,
     disposable,
     ensure_path_exists,
+    free_storage,
     is_ddp_ignored,
     is_dp_rank_0,
     is_model_parallel_parameter,
@@ -72,4 +74,6 @@ __all__ = [
     'disposable',
     'colo_set_cpu_memory_capacity',
     'colo_get_cpu_memory_capacity',
+    '_cast_float',
+    'free_storage',
 ]
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 8022e84dc..998901708 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -470,3 +470,22 @@ def disposable(func: Callable) -> Callable:
             return func(*args, **kwargs)
 
     return wrapper
+
+
+def free_storage(data: torch.Tensor) -> None:
+    """Free underlying storage of a Tensor."""
+    if data.storage().size() > 0:
+        # Since we're modifying the Tensor's Storage directly, make sure the Tensor
+        # is the sole occupant of the Storage.
+        assert data.storage_offset() == 0
+        data.storage().resize_(0)
+
+
+def _cast_float(args, dtype: torch.dtype):
+    if isinstance(args, torch.Tensor) and torch.is_floating_point(args):
+        args = args.to(dtype)
+    elif isinstance(args, (list, tuple)):
+        args = type(args)(_cast_float(t, dtype) for t in args)
+    elif isinstance(args, dict):
+        args = {k: _cast_float(v, dtype) for k, v in args.items()}
+    return args
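A minimal sketch of the relocated helpers, now importable from `colossalai.utils` (as the import change in gemini_ddp.py below reflects); the nested-structure example is illustrative only.

```python
# Illustrative use of the helpers moved into colossalai/utils/common.py above.
import torch

from colossalai.utils import _cast_float, free_storage

batch = {"x": torch.randn(2, 3), "mask": torch.ones(2, 3, dtype=torch.bool)}
half = _cast_float(batch, torch.float16)    # floating-point tensors cast, the bool mask untouched

buf = torch.empty(1024)
free_storage(buf)                           # storage resized to 0; the tensor object survives
```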
diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py
index 4ca7bce7b..881ddde78 100644
--- a/colossalai/utils/data_sampler/data_parallel_sampler.py
+++ b/colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -12,12 +12,10 @@ from torch.utils.data import DataLoader, Dataset, Sampler
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.legacy.registry import DATA_SAMPLERS
 
 T_co = TypeVar('T_co', covariant=True)
 
 
-@DATA_SAMPLERS.register_module
 class DataParallelSampler(Sampler):
     """A data sampler for distributed data parallelism.
 
diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py
index 75f8576ca..dad852a34 100644
--- a/colossalai/zero/gemini/colo_init_context.py
+++ b/colossalai/zero/gemini/colo_init_context.py
@@ -87,7 +87,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
         self._default_dist_spec = default_dist_spec
 
     def _register_colo_modules(self):
-        from colossalai.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module
+        from colossalai.legacy.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module
         register_colo_module(torch.nn.Linear, ColoLinear())
         register_colo_module(torch.nn.Embedding, ColoEmbedding())
 
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 741a977d1..918b08cd3 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -10,15 +10,13 @@ import torch.nn as nn
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
 
-from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder
+from colossalai.checkpoint_io.utils import StateDictSharder, calculate_tensor_size
 from colossalai.interface import ModelWrapper
-
 from colossalai.lazy import LazyTensor
 from colossalai.logging import get_dist_logger
-from colossalai.nn.parallel.data_parallel import _cast_float, free_storage
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import get_current_device, is_ddp_ignored
+from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored
 
 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
 from .gemini_hook import GeminiZeROHook
@@ -780,5 +778,3 @@ class GeminiDDP(ModelWrapper):
                 yield block, block_size
 
         yield sharder.current_block, sharder.current_block_size
-
-
diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
index 0c9eac8b6..e5466965c 100644
--- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
+++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,7 +1,7 @@
 import torch.nn
 
-from colossalai.nn.parallel.data_parallel import _cast_float
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import (
     GradMemStats,
     GradMemTracerHook,
diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
index 281fd4755..0a94a7f5d 100644
--- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -176,7 +176,7 @@ In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overh
 
 ```python
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
-    from colossalai.nn.parallel import GeminiDDP
+    from colossalai.zero import GeminiDDP
     model = GeminiDDP(model,
                         device=get_current_device(),
                         placement_policy=placement_policy,
diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
index 5aa806c64..36c94fb49 100644
--- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils.timer import MultiTimer
 from model_zoo.gpt import GPTLMLoss
diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index 22022639c..0ec9d5c3c 100644
--- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -78,7 +78,7 @@ from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.lr_scheduler import LinearWarmupLR
-from colossalai.nn.metric import Accuracy
+from colossalai.legacy.nn.metric import Accuracy
 from colossalai.legacy.trainer import Trainer, hooks
 ```
 
diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md
index 6d2355ad9..e17c37e24 100644
--- a/docs/source/en/basics/engine_trainer.md
+++ b/docs/source/en/basics/engine_trainer.md
@@ -344,7 +344,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):
 If you wish to train with a trainer object, you can follow the code snippet below:
 
 ```python
-from colossalai.nn.metric import Accuracy
+from colossalai.legacy.nn.metric import Accuracy
 from colossalai.legacy.trainer import Trainer, hooks
 
 
diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
index 3f85d5045..dfd1e2910 100644
--- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
+++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md
@@ -160,7 +160,7 @@ for mn, module in model.named_modules():
 
 ```python
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
-    from colossalai.nn.parallel import GeminiDDP
+    from colossalai.zero import GeminiDDP
     model = GeminiDDP(model,
                         device=get_current_device(),
                         placement_policy=placement_policy,
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
index 9cfbf5873..3f57f39f2 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.legacy.trainer import Trainer, hooks
 from colossalai.utils.timer import MultiTimer
 from model_zoo.gpt import GPTLMLoss
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index 803882a5a..f7dd8d477 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -73,7 +73,7 @@ from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.lr_scheduler import LinearWarmupLR
-from colossalai.nn.metric import Accuracy
+from colossalai.legacy.nn.metric import Accuracy
 from colossalai.legacy.trainer import Trainer, hooks
 ```
 
diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md
index e57220292..ed5100299 100644
--- a/docs/source/zh-Hans/basics/engine_trainer.md
+++ b/docs/source/zh-Hans/basics/engine_trainer.md
@@ -340,7 +340,7 @@ for epoch in range(gpc.config.NUM_EPOCHS):
 
 
 ```python
-from colossalai.nn.metric import Accuracy
+from colossalai.legacy.nn.metric import Accuracy
 from colossalai.legacy.trainer import Trainer, hooks
 
 
diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py
index 668992901..e521193a9 100644
--- a/examples/language/gpt/titans/model/embed.py
+++ b/examples/language/gpt/titans/model/embed.py
@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
 
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
+from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
-from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
-from colossalai.nn.layer.utils import divide
 from colossalai.utils import get_current_device
 
 
diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py
index 2edd03606..72297c540 100644
--- a/examples/language/gpt/titans/model/gpt1d.py
+++ b/examples/language/gpt/titans/model/gpt1d.py
@@ -11,9 +11,9 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.core import global_context as gpc
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
-from colossalai.nn.layer import Linear1D_Col, Linear1D_Row
-from colossalai.nn.layer.base_layer import ParallelLayer
-from colossalai.nn.layer.utils import ACT2FN, divide
+from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
+from colossalai.legacy.nn.layer.utils import ACT2FN, divide
 from colossalai.utils import checkpoint
 from colossalai.utils.activation_checkpoint import checkpoint
 
diff --git a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py
index 30180285b..9b22d156b 100644
--- a/examples/language/gpt/titans/model/pipeline_gpt1d.py
+++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py
@@ -9,8 +9,8 @@ from colossalai import kernel
 from colossalai import nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.pipeline.utils import partition_uniform
 
 from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D
diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh
index e0dbef354..24cee1da3 100644
--- a/examples/tutorial/hybrid_parallel/test_ci.sh
+++ b/examples/tutorial/hybrid_parallel/test_ci.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 set -euxo pipefail
 
-pip install -r requirements.txt
-colossalai run --nproc_per_node 4 train.py --config config.py
+echo "legacy example"
+
+# pip install -r requirements.txt
+# colossalai run --nproc_per_node 4 train.py --config config.py
diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py
index 4953d5350..12cdec902 100644
--- a/examples/tutorial/hybrid_parallel/train.py
+++ b/examples/tutorial/hybrid_parallel/train.py
@@ -7,8 +7,8 @@ from tqdm import tqdm
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn import CrossEntropyLoss
 from colossalai.logging import get_dist_logger
-from colossalai.nn import CrossEntropyLoss
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.pipeline.pipelinable import PipelinableContext
 from colossalai.utils import is_using_pp
diff --git a/examples/tutorial/sequence_parallel/model/bert.py b/examples/tutorial/sequence_parallel/model/bert.py
index 049579c5a..b8adb501f 100644
--- a/examples/tutorial/sequence_parallel/model/bert.py
+++ b/examples/tutorial/sequence_parallel/model/bert.py
@@ -1,33 +1,37 @@
-from colossalai.context.parallel_mode import ParallelMode
+import inspect
+
 import torch
 import torch.nn as nn
-import inspect
-from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding
-from .layers.init_method import init_normal, output_init_normal
-from colossalai.core import global_context as gpc
+
 from colossalai.context import ParallelMode
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.kernel import LayerNorm
-from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
+from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.pipeline.utils import partition_uniform
 
+from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding
+from .layers.init_method import init_normal, output_init_normal
+
 
 class BertForPretrain(nn.Module):
 
-    def __init__(self,
-                 vocab_size,
-                 hidden_size,
-                 max_sequence_length,
-                 num_attention_heads,
-                 num_layers,
-                 add_binary_head,
-                 is_naive_fp16,
-                 num_tokentypes=2,
-                 dropout_prob=0.1,
-                 mlp_ratio=4,
-                 init_std=0.02,
-                 convert_fp16_to_fp32_in_softmax=False,
-                 ):
+    def __init__(
+        self,
+        vocab_size,
+        hidden_size,
+        max_sequence_length,
+        num_attention_heads,
+        num_layers,
+        add_binary_head,
+        is_naive_fp16,
+        num_tokentypes=2,
+        dropout_prob=0.1,
+        mlp_ratio=4,
+        init_std=0.02,
+        convert_fp16_to_fp32_in_softmax=False,
+    ):
         super().__init__()
         self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE)
         assert max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size'
@@ -47,19 +51,19 @@ class BertForPretrain(nn.Module):
         self.bert_layers = nn.ModuleList()
 
         for i in range(num_layers):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)
 
         self.layer_norm = LayerNorm(hidden_size)
-        self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0),
+        self.head = BertDualHead(hidden_size,
+                                 self.embedding.word_embedding_weight.size(0),
                                  add_binary_head=add_binary_head)
         self.reset_parameters()
 
@@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module):
             end_idx = num_layers
 
         for i in range(start_idx, end_idx):
-            bert_layer = BertLayer(layer_number=i+1,
+            bert_layer = BertLayer(layer_number=i + 1,
                                    hidden_size=hidden_size,
                                    num_attention_heads=num_attention_heads,
                                    attention_dropout=dropout_prob,
                                    mlp_ratio=mlp_ratio,
                                    hidden_dropout=dropout_prob,
                                    convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-                                   is_naive_fp16=is_naive_fp16
-                                   )
+                                   is_naive_fp16=is_naive_fp16)
             self.bert_layers.append(bert_layer)
 
         if self.last_stage:
             self.word_embeddings = VocabEmbedding(vocab_size, hidden_size)
             self.layer_norm = LayerNorm(hidden_size)
-            self.head = BertDualHead(hidden_size, vocab_size,
-                                     add_binary_head=add_binary_head)
+            self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head)
         self.reset_parameters()
 
     def _init_normal(self, tensor):
diff --git a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py
index 4ede21516..56ba511d8 100644
--- a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py
+++ b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py
@@ -1,10 +1,12 @@
 import torch
 import torch.nn as nn
-from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing
-from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference
+
 from colossalai.kernel.cuda_native import LayerNorm
-from .mlp import TransformerMLP
+from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train
+from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing
+
 from .dropout import get_bias_dropout_add
+from .mlp import TransformerMLP
 
 
 def attention_mask_func(attention_scores, attention_mask):
@@ -48,8 +50,7 @@ class BertLayer(nn.Module):
             layer_number=layer_number,
             apply_query_key_layer_scaling=True,
             convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax,
-            fp16=is_naive_fp16
-        )
+            fp16=is_naive_fp16)
 
         self.hidden_dropout = hidden_dropout
         self.bias_dropout_fusion = bias_dropout_fusion
@@ -89,11 +90,8 @@ class BertLayer(nn.Module):
 
         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            layernorm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual,
+                                                    self.hidden_dropout)
 
         # Layer norm post the self attention.
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -109,10 +107,6 @@ class BertLayer(nn.Module):
 
         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            output = bias_dropout_add_func(
-                mlp_output,
-                mlp_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)
 
         return output
diff --git a/tests/components_to_test/hanging_param_model.py b/tests/components_to_test/hanging_param_model.py
index 329a08ea2..0e6543121 100644
--- a/tests/components_to_test/hanging_param_model.py
+++ b/tests/components_to_test/hanging_param_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/inline_op_model.py b/tests/components_to_test/inline_op_model.py
index f061d48f9..80757f361 100644
--- a/tests/components_to_test/inline_op_model.py
+++ b/tests/components_to_test/inline_op_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/nested_model.py b/tests/components_to_test/nested_model.py
index 339084639..3e779b0a6 100644
--- a/tests/components_to_test/nested_model.py
+++ b/tests/components_to_test/nested_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 
 from .registry import non_distributed_component_funcs
 from .utils import DummyDataGenerator
diff --git a/tests/components_to_test/repeated_computed_layers.py b/tests/components_to_test/repeated_computed_layers.py
index b3f84bd0e..c1ef99aa0 100644
--- a/tests/components_to_test/repeated_computed_layers.py
+++ b/tests/components_to_test/repeated_computed_layers.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py
index cd9d7ebc0..064974a15 100644
--- a/tests/components_to_test/simple_net.py
+++ b/tests/components_to_test/simple_net.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from colossalai.utils.cuda import get_current_device
 
 from .registry import non_distributed_component_funcs
diff --git a/tests/test_comm/test_boardcast_send_recv_v2.py b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
similarity index 93%
rename from tests/test_comm/test_boardcast_send_recv_v2.py
rename to tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
index 253f6f21c..c5fb049fe 100644
--- a/tests/test_comm/test_boardcast_send_recv_v2.py
+++ b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
@@ -1,10 +1,10 @@
 import pytest
 import torch
 
-from colossalai.communication.p2p_v2 import _recv_object, _send_object
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import _recv_object, _send_object
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
diff --git a/tests/test_comm/test_comm.py b/tests/test_legacy/test_comm/test_comm.py
similarity index 96%
rename from tests/test_comm/test_comm.py
rename to tests/test_legacy/test_comm/test_comm.py
index 747596bd2..3251d8d46 100644
--- a/tests/test_comm/test_comm.py
+++ b/tests/test_legacy/test_comm/test_comm.py
@@ -2,10 +2,10 @@ import pytest
 import torch
 import torch.distributed as dist
 
-from colossalai.communication import all_gather, all_reduce, reduce_scatter
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from colossalai.utils import get_current_device
 
diff --git a/tests/test_comm/test_object_list_p2p.py b/tests/test_legacy/test_comm/test_object_list_p2p.py
similarity index 98%
rename from tests/test_comm/test_object_list_p2p.py
rename to tests/test_legacy/test_comm/test_object_list_p2p.py
index e9d7630c1..f50982ee1 100644
--- a/tests/test_comm/test_object_list_p2p.py
+++ b/tests/test_legacy/test_comm/test_object_list_p2p.py
@@ -1,7 +1,10 @@
 import pytest
 import torch
 
-from colossalai.communication.p2p import (
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p import (
     recv_backward,
     recv_forward,
     send_backward,
@@ -9,9 +12,6 @@ from colossalai.communication.p2p import (
     send_forward,
     send_forward_recv_backward,
 )
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
 CONFIG = dict(parallel=dict(pipeline=2))
diff --git a/tests/test_comm/test_object_list_p2p_v2.py b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py
similarity index 97%
rename from tests/test_comm/test_object_list_p2p_v2.py
rename to tests/test_legacy/test_comm/test_object_list_p2p_v2.py
index cae38385b..040c63322 100644
--- a/tests/test_comm/test_object_list_p2p_v2.py
+++ b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py
@@ -1,10 +1,10 @@
 import pytest
 import torch
 
-from colossalai.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
diff --git a/tests/test_engine/test_engine.py b/tests/test_legacy/test_engine/test_engine.py
similarity index 100%
rename from tests/test_engine/test_engine.py
rename to tests/test_legacy/test_engine/test_engine.py
diff --git a/tests/test_engine/test_gradient_accumluation.py b/tests/test_legacy/test_engine/test_gradient_accumluation.py
similarity index 100%
rename from tests/test_engine/test_gradient_accumluation.py
rename to tests/test_legacy/test_engine/test_gradient_accumluation.py
diff --git a/tests/test_layers/test_1d/checks_1d/__init__.py b/tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py
similarity index 100%
rename from tests/test_layers/test_1d/checks_1d/__init__.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py
diff --git a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
similarity index 99%
rename from tests/test_layers/test_1d/checks_1d/check_layer_1d.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
index 668b8a334..dcb2be626 100644
--- a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py
+++ b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
@@ -5,7 +5,7 @@ from torch.nn import Parameter
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn import (
+from colossalai.legacy.nn import (
     Classifier1D,
     Embedding1D,
     Linear1D_Col,
diff --git a/tests/test_layers/test_1d/checks_1d/common.py b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py
similarity index 94%
rename from tests/test_layers/test_1d/checks_1d/common.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/common.py
index 8b7b28613..29a9a3d20 100644
--- a/tests/test_layers/test_1d/checks_1d/common.py
+++ b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py
@@ -1,15 +1,16 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import torch
-
-DEPTH = 4
-BATCH_SIZE = 8
-SEQ_LENGTH = 8
-IMG_SIZE = 16
-HIDDEN_SIZE = 8
-NUM_CLASSES = 8
-VOCAB_SIZE = 16
-
-def check_equal(A, B):
-    assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+DEPTH = 4
+BATCH_SIZE = 8
+SEQ_LENGTH = 8
+IMG_SIZE = 16
+HIDDEN_SIZE = 8
+NUM_CLASSES = 8
+VOCAB_SIZE = 16
+
+
+def check_equal(A, B):
+    assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True
diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_legacy/test_layers/test_1d/test_1d.py
similarity index 100%
rename from tests/test_layers/test_1d/test_1d.py
rename to tests/test_legacy/test_layers/test_1d/test_1d.py
diff --git a/tests/test_layers/test_2d/checks_2d/__init__.py b/tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py
similarity index 100%
rename from tests/test_layers/test_2d/checks_2d/__init__.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py
diff --git a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
similarity index 97%
rename from tests/test_layers/test_2d/checks_2d/check_layer_2d.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
index e030e473a..0ee88c260 100644
--- a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py
+++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
@@ -1,12 +1,23 @@
 import torch
+
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2D, CrossEntropyLoss2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D,
-                           VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2D,
-                           VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D)
+from colossalai.legacy.nn import (
+    Classifier2D,
+    CrossEntropyLoss2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2D,
+    VocabParallelCrossEntropyLoss2D,
+    VocabParallelEmbedding2D,
+)
 from colossalai.utils import get_current_device, print_rank_0
 
-from .common import (BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal)
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal
 
 
 def check_linear():
@@ -336,7 +347,7 @@ def check_classifier_no_given_weight():
     layer.weight.data.copy_(W)
     # W.requires_grad = True
 
-    B_shape = (OUTPUT_SIZE, )
+    B_shape = (OUTPUT_SIZE,)
     B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
     torch.distributed.broadcast(B_master, src=0)
     # B = torch.chunk(B_master, DEPTH, dim=0)[j]
@@ -572,7 +583,7 @@ def check_loss():
 
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, DEPTH, dim=0)[i]
@@ -607,7 +618,7 @@ def check_vocab_parallel_loss():
 
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, DEPTH, dim=0)[i]
diff --git a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
similarity index 96%
rename from tests/test_layers/test_2d/checks_2d/check_operation_2d.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
index a5e37b1ec..ae1d1120c 100644
--- a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py
+++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
@@ -5,10 +5,10 @@ import torch
 
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
-from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
+from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
+from colossalai.utils import get_current_device, print_rank_0
+
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal
 
 
 def check_AB():
diff --git a/tests/test_layers/test_2d/checks_2d/common.py b/tests/test_legacy/test_layers/test_2d/checks_2d/common.py
similarity index 100%
rename from tests/test_layers/test_2d/checks_2d/common.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/common.py
diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_legacy/test_layers/test_2d/test_2d.py
similarity index 100%
rename from tests/test_layers/test_2d/test_2d.py
rename to tests/test_legacy/test_layers/test_2d/test_2d.py
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/__init__.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py
similarity index 100%
rename from tests/test_layers/test_2p5d/checks_2p5d/__init__.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
similarity index 98%
rename from tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
index a8f551093..5a99b05cf 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
@@ -1,11 +1,22 @@
 import torch
+from torch.nn import Parameter
+
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2p5D, CrossEntropyLoss2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D,
-                           PatchEmbedding2p5D, VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2p5D,
-                           VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D)
+from colossalai.legacy.nn import (
+    Classifier2p5D,
+    CrossEntropyLoss2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2p5D,
+    VocabParallelCrossEntropyLoss2p5D,
+    VocabParallelEmbedding2p5D,
+)
 from colossalai.utils import get_current_device, print_rank_0
-from torch.nn import Parameter
 
 from .common import *
 
@@ -342,7 +353,7 @@ def check_classifier_no_given_weight():
     layer.weight.data.copy_(W)
     # W.requires_grad = True
 
-    B_shape = (OUTPUT_SIZE, )
+    B_shape = (OUTPUT_SIZE,)
     B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
     torch.distributed.broadcast(B_master, src=0)
     # B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j]
@@ -577,7 +588,7 @@ def check_loss():
 
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]
@@ -612,7 +623,7 @@ def check_vocab_parallel_loss():
 
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
similarity index 97%
rename from tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
index d0c3b02fc..db1996767 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
@@ -2,10 +2,9 @@ import torch
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \
-    Matmul_ATB_2p5D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
+from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D
+from colossalai.utils import get_current_device, print_rank_0
+
 from .common import *
 
 
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/common.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
similarity index 75%
rename from tests/test_layers/test_2p5d/checks_2p5d/common.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
index aff85f109..c90d8fc08 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/common.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
@@ -11,4 +11,4 @@ IMG_SIZE = 16
 
 
 def check_equal(A, B):
-    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)
\ No newline at end of file
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)
diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py
similarity index 100%
rename from tests/test_layers/test_2p5d/test_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/test_2p5d.py
diff --git a/tests/test_layers/test_3d/checks_3d/__init__.py b/tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py
similarity index 100%
rename from tests/test_layers/test_3d/checks_3d/__init__.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py
diff --git a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
similarity index 99%
rename from tests/test_layers/test_3d/checks_3d/check_layer_3d.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
index e946a1f59..cee639a9f 100644
--- a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py
+++ b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
@@ -7,8 +7,7 @@ import torch
 
 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context
-from colossalai.logging import get_dist_logger
-from colossalai.nn import (
+from colossalai.legacy.nn import (
     Classifier3D,
     CrossEntropyLoss3D,
     Embedding3D,
@@ -21,7 +20,8 @@ from colossalai.nn import (
     VocabParallelCrossEntropyLoss3D,
     VocabParallelEmbedding3D,
 )
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, print_rank_0
 
 from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal
diff --git a/tests/test_layers/test_3d/checks_3d/common.py b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py
similarity index 95%
rename from tests/test_layers/test_3d/checks_3d/common.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/common.py
index afb19c474..509fc2cec 100644
--- a/tests/test_layers/test_3d/checks_3d/common.py
+++ b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py
@@ -16,4 +16,4 @@ VOCAB_SIZE = 16
 def check_equal(A, B):
     eq = torch.allclose(A, B, rtol=1e-3, atol=1e-2)
     assert eq, f"\nA = {A}\nB = {B}"
-    return eq
\ No newline at end of file
+    return eq
diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_legacy/test_layers/test_3d/test_3d.py
similarity index 100%
rename from tests/test_layers/test_3d/test_3d.py
rename to tests/test_legacy/test_layers/test_3d/test_3d.py
diff --git a/tests/test_layers/test_cache_embedding.py b/tests/test_legacy/test_layers/test_cache_embedding.py
similarity index 99%
rename from tests/test_layers/test_cache_embedding.py
rename to tests/test_legacy/test_layers/test_cache_embedding.py
index 22d4f02a4..0760a3f1e 100644
--- a/tests/test_layers/test_cache_embedding.py
+++ b/tests/test_legacy/test_layers/test_cache_embedding.py
@@ -6,7 +6,7 @@ import pytest
 import torch
 
 import colossalai
-from colossalai.nn.parallel.layers import (
+from colossalai.legacy.nn.parallel.layers import (
     CachedEmbeddingBag,
     CachedParamMgr,
     EvictionStrategy,
diff --git a/tests/test_layers/test_sequence/checks_seq/__init__.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py
similarity index 100%
rename from tests/test_layers/test_sequence/checks_seq/__init__.py
rename to tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py
diff --git a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
similarity index 91%
rename from tests/test_layers/test_sequence/checks_seq/check_layer_seq.py
rename to tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
index 2b7b999d4..7ff91a7b7 100644
--- a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py
+++ b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
@@ -2,7 +2,7 @@ import torch
 
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import TransformerSelfAttentionRing
+from colossalai.legacy.nn import TransformerSelfAttentionRing
 from colossalai.utils import get_current_device
 
 
diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_legacy/test_layers/test_sequence/test_sequence.py
similarity index 97%
rename from tests/test_layers/test_sequence/test_sequence.py
rename to tests/test_legacy/test_layers/test_sequence/test_sequence.py
index 60f2d55f4..b9e6c1247 100644
--- a/tests/test_layers/test_sequence/test_sequence.py
+++ b/tests/test_legacy/test_layers/test_sequence/test_sequence.py
@@ -5,6 +5,7 @@ import torch.distributed as dist
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
 CONFIG = dict(parallel=dict(tensor=dict(size=4, mode='sequence')))
@@ -42,7 +43,7 @@ def check_ring_qk(rank, world_size):
     a = torch.matmul(q, k.transpose(2, 1))
 
     # compute distributed attention scores
-    ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply
+    ring_qk = RingQK.apply
     sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length)
 
     # check master and distributed attention scores
@@ -95,7 +96,7 @@ def check_ring_av(rank, world_size):
     out = torch.matmul(a, v)
 
     # compute distributed attention scores
-    ring_av = colossalai.nn.layer.parallel_sequence.RingAV.apply
+    ring_av = RingAV.apply
     sub_out = ring_av(sub_a, sub_v, batch_size, num_heads, attention_head_size, sub_seq_length)
 
     # print(f'master output shape: {out.shape}, partial output shape: {sub_out.shape}')
diff --git a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
index 8ad366133..5fb678525 100644
--- a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
+++ b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
@@ -5,7 +5,10 @@ import pytest
 import torch
 import torch.distributed as dist
 
-from colossalai.communication import (
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication import (
     recv_backward,
     recv_forward,
     recv_obj_meta,
@@ -15,9 +18,6 @@ from colossalai.communication import (
     send_forward_recv_backward,
     send_obj_meta,
 )
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from colossalai.utils import get_current_device
diff --git a/tests/test_pipeline/test_cuda_rpc_performance.py b/tests/test_pipeline/test_cuda_rpc_performance.py
deleted file mode 100644
index 4bacb2181..000000000
--- a/tests/test_pipeline/test_cuda_rpc_performance.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import time
-
-import pytest
-import torch
-import torch.nn as nn
-from rpc_test_utils import parse_args, rpc_run
-from titans.dataloader.cifar10 import build_cifar
-from torchvision.models import resnet50
-from tqdm import tqdm
-
-from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.pipeline.rpc import OneFOneBPipelineEngine
-
-
-def flatten(x):
-    return torch.flatten(x, 1)
-
-
-def partition(pp_rank: int, chunk: int, stage_num: int):
-    pipelinable = PipelinableContext()
-
-    # build model partitions
-    with pipelinable:
-        # input : [B, 3, 32, 32]
-        _ = resnet50()
-
-    pipelinable.policy = "customized"
-
-    exec_seq = [
-        'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool', (flatten, "behind"), 'fc'
-    ]
-    pipelinable.to_layer_list(exec_seq)
-    partition = pipelinable.partition(chunk, stage_num, pp_rank)
-    return partition
-
-
-def run_master(args):
-    batch_size = args.batch_size
-    chunk = args.chunk
-    device = args.device
-    world_size = args.world_size
-    stage_num = world_size
-    num_microbatches = args.num_microbatches
-
-    # build dataloader
-    root = os.environ.get('DATA', './data')
-    train_dataloader, test_dataloader = build_cifar(batch_size, root, padding=4, crop=32, resize=32)
-    criterion = nn.CrossEntropyLoss()
-
-    pp_engine = OneFOneBPipelineEngine(partition_fn=partition,
-                                       stage_num=stage_num,
-                                       num_microbatches=num_microbatches,
-                                       device=device,
-                                       chunk=chunk,
-                                       criterion=criterion,
-                                       checkpoint=False)
-
-    pp_engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
-    s = time.time()
-
-    for bx, by in tqdm(train_dataloader):
-        pp_engine.forward_backward(bx, labels=by, forward_only=False)
-
-    cost_time = time.time() - s
-
-    print("total cost time :", cost_time)
-    print("cost time per batch:", cost_time / len(train_dataloader))
-
-
-@pytest.mark.skip("Test for performance, no need for CI")
-def main():
-    args = parse_args()
-    # this is due to limitation of partition function
-    args.world_size = 2
-    args.chunk = 1
-    rpc_run(args, run_master)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
index 335be6135..9c3a7e216 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
 
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
index 175d9ef6c..03b2e4f2a 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
 
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
index 33cb3a65d..cafffd0a6 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
 
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
index 73ac2dd5f..9b43be9e8 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
 
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch