diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py index 8e06cec4f..1a90c72bd 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py @@ -1,5 +1,5 @@ class Registry: - # TODO: refactor the registry classes used in colossalai.registry, colossalai.fx and here + # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here def __init__(self, name): self.name = name diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 003f0cdd9..7186f052e 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -15,8 +15,8 @@ from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING from colossalai.context.config import Config from colossalai.context.singleton_meta import SingletonMeta from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from colossalai.logging import get_dist_logger -from colossalai.registry import DIST_GROUP_INITIALIZER from .parallel_mode import ParallelMode from .random import add_seed, get_seeds, set_mode diff --git a/colossalai/context/process_group_initializer/initializer_1d.py b/colossalai/context/process_group_initializer/initializer_1d.py index 4c0502804..ba601d0bf 100644 --- a/colossalai/context/process_group_initializer/initializer_1d.py +++ b/colossalai/context/process_group_initializer/initializer_1d.py @@ -2,8 +2,9 @@ # -*- encoding: utf-8 -*- import torch.distributed as dist + from colossalai.global_variables import tensor_parallel_env as env -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_2d.py b/colossalai/context/process_group_initializer/initializer_2d.py index 7fbe3be59..999cd5f0c 100644 --- a/colossalai/context/process_group_initializer/initializer_2d.py +++ b/colossalai/context/process_group_initializer/initializer_2d.py @@ -3,7 +3,7 @@ import math import torch.distributed as dist from colossalai.global_variables import tensor_parallel_env as env -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_2p5d.py b/colossalai/context/process_group_initializer/initializer_2p5d.py index 6b6fdc5d7..b92ae2eec 100644 --- a/colossalai/context/process_group_initializer/initializer_2p5d.py +++ b/colossalai/context/process_group_initializer/initializer_2p5d.py @@ -4,9 +4,10 @@ import math import torch.distributed as dist + from colossalai.context import Config from colossalai.global_variables import tensor_parallel_env as env -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_3d.py b/colossalai/context/process_group_initializer/initializer_3d.py index 1ed8eec86..6bca05ad7 100644 --- 
a/colossalai/context/process_group_initializer/initializer_3d.py +++ b/colossalai/context/process_group_initializer/initializer_3d.py @@ -6,7 +6,7 @@ import math import torch.distributed as dist from colossalai.global_variables import tensor_parallel_env as env -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_data.py b/colossalai/context/process_group_initializer/initializer_data.py index 9715ebff7..b9dec4541 100644 --- a/colossalai/context/process_group_initializer/initializer_data.py +++ b/colossalai/context/process_group_initializer/initializer_data.py @@ -3,7 +3,7 @@ from torch import distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_model.py b/colossalai/context/process_group_initializer/initializer_model.py index 99b9cc0d4..614ba372f 100644 --- a/colossalai/context/process_group_initializer/initializer_model.py +++ b/colossalai/context/process_group_initializer/initializer_model.py @@ -2,9 +2,11 @@ # -*- encoding: utf-8 -*- import torch.distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer + +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER + from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer @DIST_GROUP_INITIALIZER.register_module diff --git a/colossalai/context/process_group_initializer/initializer_pipeline.py b/colossalai/context/process_group_initializer/initializer_pipeline.py index 0ddb52f63..e093333ad 100644 --- a/colossalai/context/process_group_initializer/initializer_pipeline.py +++ b/colossalai/context/process_group_initializer/initializer_pipeline.py @@ -3,7 +3,7 @@ from torch import distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .process_group_initializer import ProcessGroupInitializer diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py index 251a29407..a6e26b6bc 100644 --- a/colossalai/context/process_group_initializer/initializer_sequence.py +++ b/colossalai/context/process_group_initializer/initializer_sequence.py @@ -2,7 +2,7 @@ # -*- encoding: utf-8 -*- import torch.distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER +from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode from .initializer_tensor import Initializer_Tensor diff --git a/colossalai/context/process_group_initializer/initializer_tensor.py b/colossalai/context/process_group_initializer/initializer_tensor.py index d2b5be9cf..3be89e52a 100644 --- a/colossalai/context/process_group_initializer/initializer_tensor.py +++ b/colossalai/context/process_group_initializer/initializer_tensor.py @@ -3,9 +3,10 @@ import torch.distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer 
+from colossalai.legacy.registry import DIST_GROUP_INITIALIZER + from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer @DIST_GROUP_INITIALIZER.register_module diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 32354dde8..a1694e059 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -17,10 +17,10 @@ from torch.utils.data import DataLoader from colossalai.amp import AMP_TYPE, convert_to_amp from colossalai.amp.naive_amp import NaiveAMPModel -from colossalai.builder.builder import build_gradient_handler from colossalai.context import Config, ConfigException, ParallelMode from colossalai.context.moe_context import MOE_CONTEXT from colossalai.core import global_context as gpc +from colossalai.legacy.builder.builder import build_gradient_handler from colossalai.legacy.engine import Engine from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient from colossalai.legacy.engine.schedule import ( diff --git a/colossalai/builder/__init__.py b/colossalai/legacy/builder/__init__.py similarity index 100% rename from colossalai/builder/__init__.py rename to colossalai/legacy/builder/__init__.py diff --git a/colossalai/builder/builder.py b/colossalai/legacy/builder/builder.py similarity index 98% rename from colossalai/builder/builder.py rename to colossalai/legacy/builder/builder.py index a14509392..ff14f46dc 100644 --- a/colossalai/builder/builder.py +++ b/colossalai/legacy/builder/builder.py @@ -3,7 +3,7 @@ import inspect -from colossalai.registry import * +from colossalai.legacy.registry import * def build_from_config(module, config: dict): diff --git a/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py index d0196e3c4..c5da2e55a 100644 --- a/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py @@ -1,6 +1,6 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler from .utils import bucket_allreduce diff --git a/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py index f2db95752..395d83da0 100644 --- a/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py @@ -1,7 +1,7 @@ from colossalai.context.moe_context import MOE_CONTEXT from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from colossalai.utils.moe import get_moe_epsize_param_dict from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py index 5b49a9c03..7d4d9d73a 100644 --- a/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py @@ -7,7 +7,7 @@ import torch.distributed as dist from torch._utils import 
_flatten_dense_tensors, _unflatten_dense_tensors from colossalai.core import global_context as gpc -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py index f13568094..41098ab39 100644 --- a/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py @@ -1,6 +1,6 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler from .utils import bucket_allreduce diff --git a/colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py index 19fd1e97f..4ca7cd0b0 100644 --- a/colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py @@ -1,4 +1,4 @@ -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/registry/__init__.py b/colossalai/legacy/registry/__init__.py similarity index 100% rename from colossalai/registry/__init__.py rename to colossalai/legacy/registry/__init__.py diff --git a/colossalai/registry/registry.py b/colossalai/legacy/registry/registry.py similarity index 98% rename from colossalai/registry/registry.py rename to colossalai/legacy/registry/registry.py index 8a4173f7a..50d6b74c5 100644 --- a/colossalai/registry/registry.py +++ b/colossalai/legacy/registry/registry.py @@ -6,7 +6,7 @@ from typing import List class Registry: - """This is a registry class used to register classes and modules so that a universal + """This is a registry class used to register classes and modules so that a universal object builder can be enabled. Args: @@ -42,7 +42,7 @@ class Registry: return module_class def get_module(self, module_name: str): - """Retrieves a module with name `module_name` and returns the module if it has + """Retrieves a module with name `module_name` and returns the module if it has already been registered before. 
Args: diff --git a/colossalai/legacy/trainer/hooks/_checkpoint_hook.py b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py index 7754ebcc3..6b150d291 100644 --- a/colossalai/legacy/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py @@ -2,9 +2,9 @@ # -*- encoding: utf-8 -*- import torch +from colossalai.legacy.registry import HOOKS from colossalai.legacy.trainer.hooks import BaseHook from colossalai.logging import get_dist_logger -from colossalai.registry import HOOKS from colossalai.utils.checkpointing import save_checkpoint from ._lr_scheduler_hook import LRSchedulerHook diff --git a/colossalai/legacy/trainer/hooks/_log_hook.py b/colossalai/legacy/trainer/hooks/_log_hook.py index 1efc8be76..7d9ad19aa 100644 --- a/colossalai/legacy/trainer/hooks/_log_hook.py +++ b/colossalai/legacy/trainer/hooks/_log_hook.py @@ -7,9 +7,9 @@ from typing import List from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.registry import HOOKS from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric from colossalai.logging import DistributedLogger -from colossalai.registry import HOOKS from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage from ._base_hook import BaseHook diff --git a/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py b/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py index 0d19ab08a..6d60966da 100644 --- a/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py +++ b/colossalai/legacy/trainer/hooks/_lr_scheduler_hook.py @@ -1,6 +1,6 @@ from torch import Tensor -from colossalai.registry import HOOKS +from colossalai.legacy.registry import HOOKS from ._metric_hook import LearningRateMetric, MetricHook diff --git a/colossalai/legacy/trainer/hooks/_metric_hook.py b/colossalai/legacy/trainer/hooks/_metric_hook.py index 96def4172..d0598c240 100644 --- a/colossalai/legacy/trainer/hooks/_metric_hook.py +++ b/colossalai/legacy/trainer/hooks/_metric_hook.py @@ -10,7 +10,7 @@ import torch.distributed as dist from colossalai.communication import all_reduce from colossalai.context import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import HOOKS +from colossalai.legacy.registry import HOOKS from colossalai.utils import get_current_device, is_no_pp_or_last_stage from ._base_hook import BaseHook @@ -356,7 +356,7 @@ class ThroughputMetric(Metric): self.last_step_num_samples *= gpc.get_world_size(ParallelMode.DATA) else: self.last_step_used_time = all_reduce(self.last_step_used_time, ParallelMode.DATA) / \ - gpc.get_world_size(ParallelMode.DATA) + gpc.get_world_size(ParallelMode.DATA) self.last_step_num_samples = all_reduce(self.last_step_num_samples, ParallelMode.DATA) sample_per_sec = _format_number(self.last_step_num_samples / (self.last_step_used_time + 1e-12).item()) @@ -367,7 +367,7 @@ class ThroughputMetric(Metric): self.last_step_num_samples *= gpc.get_world_size(ParallelMode.DATA) else: self.last_step_used_time = all_reduce(self.last_step_used_time, ParallelMode.DATA) / \ - gpc.get_world_size(ParallelMode.DATA) + gpc.get_world_size(ParallelMode.DATA) self.last_step_num_samples = all_reduce(self.last_step_num_samples, ParallelMode.DATA) sample_per_sec = _format_number(self.last_step_num_samples / (self.last_step_used_time + 1e-12).item()) diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 406173a18..7b129009e 100644 --- 
a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -15,8 +15,8 @@ from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env from colossalai.kernel import LayerNorm +from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.registry import LAYERS from colossalai.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/nn/layer/parallel_2d/layers.py index f3a4d2bbb..1a01d5437 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/nn/layer/parallel_2d/layers.py @@ -5,21 +5,30 @@ from typing import Callable import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor +from torch.nn import Parameter + from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.registry import LAYERS from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict from colossalai.utils.cuda import get_current_device -from torch import Tensor -from torch.nn import Parameter from ..base_layer import ParallelLayer from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple -from ._operation import (Matmul_AB_2D, Matmul_ABT_2D, add_bias_2d, all_gather_tensor_2d, classifier_2d, layernorm_2d, - reduce_scatter_tensor_2d, split_batch_2d) +from ._operation import ( + Matmul_AB_2D, + Matmul_ABT_2D, + add_bias_2d, + all_gather_tensor_2d, + classifier_2d, + layernorm_2d, + reduce_scatter_tensor_2d, + split_batch_2d, +) from ._utils import assert_summa_initialization, get_summa_dim_from_env diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index f849cbbe7..62c4292fd 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -5,22 +5,34 @@ from typing import Callable import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor +from torch.nn import Parameter + from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.registry import LAYERS -from colossalai.utils.checkpointing import (broadcast_state_dict, gather_tensor_parallel_state_dict, - partition_tensor_parallel_state_dict) +from colossalai.utils.checkpointing import ( + broadcast_state_dict, + gather_tensor_parallel_state_dict, + partition_tensor_parallel_state_dict, +) from colossalai.utils.cuda import get_current_device -from torch import Tensor -from torch.nn import Parameter from ..base_layer import ParallelLayer from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple -from ._operation import (Matmul_AB_2p5D, Matmul_ABT_2p5D, add_bias_2p5d, all_gather_tensor_2p5d, classifier_2p5d, - layernorm_2p5d, reduce_scatter_tensor_2p5d, split_batch_2p5d) +from ._operation import ( + Matmul_AB_2p5D, + 
Matmul_ABT_2p5D, + add_bias_2p5d, + all_gather_tensor_2p5d, + classifier_2p5d, + layernorm_2p5d, + reduce_scatter_tensor_2p5d, + split_batch_2p5d, +) from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_env diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/nn/layer/parallel_3d/layers.py index 99b0c3f8b..7d940aa27 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/nn/layer/parallel_3d/layers.py @@ -13,9 +13,9 @@ from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.registry import LAYERS from colossalai.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/nn/layer/parallel_sequence/layers.py index 0887f8389..4d0ff2e06 100644 --- a/colossalai/nn/layer/parallel_sequence/layers.py +++ b/colossalai/nn/layer/parallel_sequence/layers.py @@ -2,20 +2,20 @@ # -*- encoding: utf-8 -*- import math -import colossalai import torch import torch.nn as nn import torch.nn.functional as F from torch.nn import Parameter +import colossalai +from colossalai.context import seed from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_sequence._operation import RingQK, RingAV -from colossalai.registry import LAYERS -from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType from colossalai.kernel import FusedScaleMaskSoftmax -from colossalai.context import seed +from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.registry import LAYERS +from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK @LAYERS.register_module diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/nn/layer/vanilla/layers.py index 225aed391..0e11fc4d0 100644 --- a/colossalai/nn/layer/vanilla/layers.py +++ b/colossalai/nn/layer/vanilla/layers.py @@ -8,8 +8,8 @@ from torch import nn as nn from torch.nn.parameter import Parameter from colossalai.context import seed +from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.registry import LAYERS from colossalai.utils.cuda import get_current_device from ..utils import to_2tuple diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/nn/loss/loss_1d.py index dd548c1d3..8c9483fcc 100644 --- a/colossalai/nn/loss/loss_1d.py +++ b/colossalai/nn/loss/loss_1d.py @@ -1,105 +1,106 @@ -import torch -import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.registry import LOSSES -from torch.cuda.amp import custom_bwd, custom_fwd -from torch.nn.modules.loss import _Loss - - -class _VocabParallelCrossEntropy1D(torch.autograd.Function): - - @staticmethod - @custom_fwd(cast_inputs=torch.float32) - def forward(ctx, vocab_parallel_logits, targets, process_group): - if process_group is None: - process_group = gpc.get_group(ParallelMode.PARALLEL_1D) - - # Maximum value along vocab dimension across all GPUs. 
- logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] - torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=process_group) - # Subtract the maximum value. - vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) - - # Get the partition's vocab indices - partition_vocab_size = vocab_parallel_logits.size()[-1] - rank = dist.get_rank(process_group) - vocab_start_index = partition_vocab_size * rank - vocab_end_index = vocab_start_index + partition_vocab_size - - # Create a mask of valid vocab ids (1 means it needs to be masked). - target_mask = (targets < vocab_start_index) | (targets >= vocab_end_index) - masked_target = targets.clone() - vocab_start_index - masked_target[target_mask] = 0 - - # Get predicted-logits = logits[target]. - # For Simplicity, we convert logits to a 2-D tensor with size - # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. - logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) - masked_target_1d = masked_target.view(-1) - arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) - predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] - predicted_logits_1d = predicted_logits_1d.clone().contiguous() - predicted_logits = predicted_logits_1d.view_as(targets) - predicted_logits[target_mask] = 0.0 - # All reduce is needed to get the chunks from other GPUs. - torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) - - # Sum of exponential of logits along vocab dimension across all GPUs. - exp_logits = torch.exp(vocab_parallel_logits) - sum_exp_logits = exp_logits.sum(dim=-1) - torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) - - # Loss = log(sum(exp(logits))) - predicted-logit. - loss = torch.log(sum_exp_logits) - predicted_logits - # Store softmax, target-mask and masked-target for backward pass. - exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) - ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) - return loss - - @staticmethod - @custom_bwd - def backward(ctx, grad_output): - - # Retrieve tensors from the forward path. - softmax, target_mask, masked_target_1d = ctx.saved_tensors - - # All the inputs have softmax as their gradient. - grad_input = softmax - # For simplicity, work with the 2D gradient. - partition_vocab_size = softmax.size()[-1] - grad_2d = grad_input.view(-1, partition_vocab_size) - - # Add the gradient from matching classes. - arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) - grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float()) - - # Finally elementwise multiplication with the output gradients. - grad_input.mul_(grad_output.unsqueeze(dim=-1)) - - return grad_input, None, None - - -@LOSSES.register_module -class VocabParallelCrossEntropyLoss1D(_Loss): - """Vocab parallel cross entropy loss for 1D parallelism. - - Args: - reduction (bool, optional): whether to average the loss, defaults to True. - """ - - def __init__(self, reduction=True): - super().__init__() - self.reduction_mean = reduction - - def forward(self, logits, targets, process_group=None): - """Calculate loss between logits and targets. - - Args: - logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). - targets (:class:`torch.tensor`): Ground truth class indices or class probabilities. 
- """ - loss = _VocabParallelCrossEntropy1D.apply(logits, targets, process_group) - if self.reduction_mean: - loss = loss.mean() - return loss +import torch +import torch.distributed as dist +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.nn.modules.loss import _Loss + +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.registry import LOSSES + + +class _VocabParallelCrossEntropy1D(torch.autograd.Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, vocab_parallel_logits, targets, process_group): + if process_group is None: + process_group = gpc.get_group(ParallelMode.PARALLEL_1D) + + # Maximum value along vocab dimension across all GPUs. + logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] + torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=process_group) + # Subtract the maximum value. + vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) + + # Get the partition's vocab indices + partition_vocab_size = vocab_parallel_logits.size()[-1] + rank = dist.get_rank(process_group) + vocab_start_index = partition_vocab_size * rank + vocab_end_index = vocab_start_index + partition_vocab_size + + # Create a mask of valid vocab ids (1 means it needs to be masked). + target_mask = (targets < vocab_start_index) | (targets >= vocab_end_index) + masked_target = targets.clone() - vocab_start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. + logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + predicted_logits_1d = predicted_logits_1d.clone().contiguous() + predicted_logits = predicted_logits_1d.view_as(targets) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = torch.exp(vocab_parallel_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=process_group) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + # Store softmax, target-mask and masked-target for backward pass. + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + return loss + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + + # Retrieve tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + partition_vocab_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, partition_vocab_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float()) + + # Finally elementwise multiplication with the output gradients. 
+ grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None, None + + +@LOSSES.register_module +class VocabParallelCrossEntropyLoss1D(_Loss): + """Vocab parallel cross entropy loss for 1D parallelism. + + Args: + reduction (bool, optional): whether to average the loss, defaults to True. + """ + + def __init__(self, reduction=True): + super().__init__() + self.reduction_mean = reduction + + def forward(self, logits, targets, process_group=None): + """Calculate loss between logits and targets. + + Args: + logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). + targets (:class:`torch.tensor`): Ground truth class indices or class probabilities. + """ + loss = _VocabParallelCrossEntropy1D.apply(logits, targets, process_group) + if self.reduction_mean: + loss = loss.mean() + return loss diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/nn/loss/loss_2d.py index 7da8b2d69..6db40c0f3 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/nn/loss/loss_2d.py @@ -1,15 +1,16 @@ import torch import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d -from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization -from colossalai.registry import LOSSES -from colossalai.utils import get_current_device from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.registry import LOSSES +from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization +from colossalai.utils import get_current_device + @LOSSES.register_module class CrossEntropyLoss2D(_Loss): diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/nn/loss/loss_2p5d.py index 63dc4f33a..9c78a1ef0 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/nn/loss/loss_2p5d.py @@ -1,15 +1,16 @@ import torch import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d -from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization -from colossalai.registry import LOSSES -from colossalai.utils import get_current_device from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.registry import LOSSES +from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization +from colossalai.utils import get_current_device + @LOSSES.register_module class CrossEntropyLoss2p5D(_Loss): diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/nn/loss/loss_3d.py index f27d57ad6..5c0f26640 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/nn/loss/loss_3d.py @@ -1,15 +1,16 @@ import torch import torch.distributed as dist -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D, OUTPUT_GROUP_3D -from colossalai.core import global_context as gpc -from 
colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env -from colossalai.registry import LOSSES -from colossalai.utils import get_current_device from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss +from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.core import global_context as gpc +from colossalai.legacy.registry import LOSSES +from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.utils import get_current_device + @LOSSES.register_module class CrossEntropyLoss3D(_Loss): diff --git a/colossalai/nn/loss/loss_moe.py b/colossalai/nn/loss/loss_moe.py index a8b18a3e3..40cea788c 100644 --- a/colossalai/nn/loss/loss_moe.py +++ b/colossalai/nn/loss/loss_moe.py @@ -1,80 +1,81 @@ -import torch.nn as nn -from colossalai.registry import LOSSES -from torch.nn.modules.loss import _Loss -from colossalai.context.moe_context import MOE_CONTEXT - - -@LOSSES.register_module -class MoeCrossEntropyLoss(_Loss): - r"""torch.nn.CrossEntropyLoss added with auxiliary loss. - - Args: - input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). - target (:class:`torch.tensor`): Ground truth class indices or class probabilities. - aux_weight (float, optional): Weight of auxiliary loss in total loss.Defaults 0.01. - - The ``args`` and ``kwargs`` should include parameters below: - :: - - weight (Tensor, optional) - size_average (bool, optional) - ignore_index (int, optional) - reduce (bool, optional) - reduction (str, optional) - label_smoothing (float, optional) - - More details about ``args``, ``kwargs`` and ``torch.nn.functional.cross_entropy`` could be found in - `Cross_entropy `_. - """ - - def __init__(self, aux_weight: float = 0.01, *args, **kwargs): - super().__init__() - self.loss = nn.CrossEntropyLoss(*args, **kwargs) - self.aux_weight = aux_weight - - def forward(self, *args): - """ - The ``args`` should at least include parameters below: - :: - - input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). - target (:class:`torch.tensor`): Ground truth class indices or class probabilities. - - More details about ``args``, ``kwargs`` and ``torch.nn.functional.cross_entropy`` could be found in - `Cross_entropy `_. - """ - main_loss = self.loss(*args) - aux_loss = MOE_CONTEXT.get_loss() - return main_loss + self.aux_weight * aux_loss - - -@LOSSES.register_module -class MoeLoss(_Loss): - """A wrapper class for any loss module to add with auxiliary loss. - - Args: - aux_weight (float): Weight of auxiliary loss in total loss. - loss_fn (``Callable``): Loss function. - args (list): Args in loss function. - kwargs (dict): Kwargs in loss function - """ - - def __init__(self, aux_weight: float, loss_fn, *args, **kwargs): - super().__init__() - self.loss_fn = loss_fn(*args, **kwargs) - self.aux_weight = aux_weight - - def forward(self, *args, **kwargs): - """ - The ``args`` and ``kwargs`` should at least include parameters below: - :: - - input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). - target (:class:`torch.tensor`): Ground truth class indices or class probabilities. 
- - Note: - The ``args`` and ``kwargs`` may include different parameters varying with different loss function. - """ - main_loss = self.loss_fn(*args, **kwargs) - aux_loss = MOE_CONTEXT.get_loss() - return main_loss + self.aux_weight * aux_loss +import torch.nn as nn +from torch.nn.modules.loss import _Loss + +from colossalai.context.moe_context import MOE_CONTEXT +from colossalai.legacy.registry import LOSSES + + +@LOSSES.register_module +class MoeCrossEntropyLoss(_Loss): + r"""torch.nn.CrossEntropyLoss added with auxiliary loss. + + Args: + input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). + target (:class:`torch.tensor`): Ground truth class indices or class probabilities. + aux_weight (float, optional): Weight of auxiliary loss in total loss.Defaults 0.01. + + The ``args`` and ``kwargs`` should include parameters below: + :: + + weight (Tensor, optional) + size_average (bool, optional) + ignore_index (int, optional) + reduce (bool, optional) + reduction (str, optional) + label_smoothing (float, optional) + + More details about ``args``, ``kwargs`` and ``torch.nn.functional.cross_entropy`` could be found in + `Cross_entropy `_. + """ + + def __init__(self, aux_weight: float = 0.01, *args, **kwargs): + super().__init__() + self.loss = nn.CrossEntropyLoss(*args, **kwargs) + self.aux_weight = aux_weight + + def forward(self, *args): + """ + The ``args`` should at least include parameters below: + :: + + input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). + target (:class:`torch.tensor`): Ground truth class indices or class probabilities. + + More details about ``args``, ``kwargs`` and ``torch.nn.functional.cross_entropy`` could be found in + `Cross_entropy `_. + """ + main_loss = self.loss(*args) + aux_loss = MOE_CONTEXT.get_loss() + return main_loss + self.aux_weight * aux_loss + + +@LOSSES.register_module +class MoeLoss(_Loss): + """A wrapper class for any loss module to add with auxiliary loss. + + Args: + aux_weight (float): Weight of auxiliary loss in total loss. + loss_fn (``Callable``): Loss function. + args (list): Args in loss function. + kwargs (dict): Kwargs in loss function + """ + + def __init__(self, aux_weight: float, loss_fn, *args, **kwargs): + super().__init__() + self.loss_fn = loss_fn(*args, **kwargs) + self.aux_weight = aux_weight + + def forward(self, *args, **kwargs): + """ + The ``args`` and ``kwargs`` should at least include parameters below: + :: + + input (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits). + target (:class:`torch.tensor`): Ground truth class indices or class probabilities. + + Note: + The ``args`` and ``kwargs`` may include different parameters varying with different loss function. 
+ """ + main_loss = self.loss_fn(*args, **kwargs) + aux_loss = MOE_CONTEXT.get_loss() + return main_loss + self.aux_weight * aux_loss diff --git a/colossalai/nn/lr_scheduler/cosine.py b/colossalai/nn/lr_scheduler/cosine.py index aab523bef..0010435c2 100644 --- a/colossalai/nn/lr_scheduler/cosine.py +++ b/colossalai/nn/lr_scheduler/cosine.py @@ -1,6 +1,7 @@ from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS + from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler diff --git a/colossalai/nn/lr_scheduler/linear.py b/colossalai/nn/lr_scheduler/linear.py index 556938b8a..251779647 100644 --- a/colossalai/nn/lr_scheduler/linear.py +++ b/colossalai/nn/lr_scheduler/linear.py @@ -1,6 +1,6 @@ from torch.optim.lr_scheduler import _LRScheduler -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS @LR_SCHEDULERS.register_module diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py index 29531a9e3..4f18b49fc 100644 --- a/colossalai/nn/lr_scheduler/multistep.py +++ b/colossalai/nn/lr_scheduler/multistep.py @@ -2,7 +2,8 @@ from typing import List from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS + from .delayed import WarmupScheduler diff --git a/colossalai/nn/lr_scheduler/onecycle.py b/colossalai/nn/lr_scheduler/onecycle.py index 8007fd360..20e9aaec6 100644 --- a/colossalai/nn/lr_scheduler/onecycle.py +++ b/colossalai/nn/lr_scheduler/onecycle.py @@ -1,6 +1,6 @@ from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS @LR_SCHEDULERS.register_module diff --git a/colossalai/nn/lr_scheduler/poly.py b/colossalai/nn/lr_scheduler/poly.py index 16352bc51..a98506423 100644 --- a/colossalai/nn/lr_scheduler/poly.py +++ b/colossalai/nn/lr_scheduler/poly.py @@ -1,6 +1,7 @@ from torch.optim.lr_scheduler import _LRScheduler -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS + from .delayed import WarmupScheduler diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py index 05d2a49c1..09f5d4585 100644 --- a/colossalai/nn/lr_scheduler/torch.py +++ b/colossalai/nn/lr_scheduler/torch.py @@ -1,9 +1,9 @@ +from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR from torch.optim.lr_scheduler import LambdaLR as _LambdaLR from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR from torch.optim.lr_scheduler import StepLR as _StepLR -from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR -from colossalai.registry import LR_SCHEDULERS +from colossalai.legacy.registry import LR_SCHEDULERS @LR_SCHEDULERS.register_module diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 3a6d37103..210400a21 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -4,7 +4,7 @@ from typing import Optional import torch from colossalai.kernel.op_builder import CPUAdamBuilder -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS from .nvme_optimizer import NVMeOptimizer diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py 
index 82a6250f1..0d13873cd 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -8,7 +8,7 @@ Licensed under the MIT License. ''' import torch -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py index 72520064e..48cc097c7 100644 --- a/colossalai/nn/optimizer/fused_lamb.py +++ b/colossalai/nn/optimizer/fused_lamb.py @@ -1,7 +1,7 @@ # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py import torch -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py index 468713b22..0e8d3fc10 100644 --- a/colossalai/nn/optimizer/fused_sgd.py +++ b/colossalai/nn/optimizer/fused_sgd.py @@ -2,7 +2,7 @@ import torch from torch.optim.optimizer import Optimizer, required -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 84903ac36..7aa0ced18 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -4,7 +4,7 @@ import torch from torch.optim import Adam from colossalai.kernel.op_builder import FusedOptimBuilder -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier from .cpu_adam import CPUAdam diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py index 399ad39b6..769c11f62 100644 --- a/colossalai/nn/optimizer/lamb.py +++ b/colossalai/nn/optimizer/lamb.py @@ -5,7 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch- import torch from torch.optim import Optimizer -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS @OPTIMIZERS.register_module diff --git a/colossalai/nn/optimizer/lars.py b/colossalai/nn/optimizer/lars.py index 212f66671..9dbb83b84 100644 --- a/colossalai/nn/optimizer/lars.py +++ b/colossalai/nn/optimizer/lars.py @@ -5,7 +5,7 @@ from typing import Iterable import torch from torch.optim import Optimizer -from colossalai.registry import OPTIMIZERS +from colossalai.legacy.registry import OPTIMIZERS @OPTIMIZERS.register_module @@ -22,28 +22,24 @@ class Lars(Optimizer): weight_decay (float, optional): weight decay (L2 penalty) (default: 0) """ - def __init__( - self, - params: Iterable[torch.nn.Parameter], - lr=1e-3, - momentum=0, - eeta=1e-3, - weight_decay=0, - epsilon=0.0 - ) -> None: + def __init__(self, + params: Iterable[torch.nn.Parameter], + lr=1e-3, + momentum=0, + eeta=1e-3, + weight_decay=0, + epsilon=0.0) -> None: if not isinstance(lr, float) or lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: raise ValueError("Invalid momentum value: {}".format(momentum)) if weight_decay < 0.0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay)) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) if eeta <= 0 or eeta > 1: raise ValueError("Invalid eeta value: {}".format(eeta)) if epsilon < 0: raise ValueError("Invalid epsilon value: {}".format(epsilon)) - defaults = 
dict(lr=lr, momentum=momentum, - weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True) + defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True) super().__init__(params, defaults) @@ -76,11 +72,9 @@ class Lars(Optimizer): if lars: w_norm = torch.norm(p) g_norm = torch.norm(p.grad) - trust_ratio = torch.where( - w_norm > 0 and g_norm > 0, - eeta * w_norm / (g_norm + weight_decay * w_norm + eps), - torch.ones_like(w_norm) - ) + trust_ratio = torch.where(w_norm > 0 and g_norm > 0, + eeta * w_norm / (g_norm + weight_decay * w_norm + eps), + torch.ones_like(w_norm)) trust_ratio.clamp_(0.0, 50) scaled_lr *= trust_ratio.item() if weight_decay != 0: @@ -90,8 +84,7 @@ class Lars(Optimizer): if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: - buf = param_state['momentum_buffer'] = torch.clone( - decayed_grad).detach() + buf = param_state['momentum_buffer'] = torch.clone(decayed_grad).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(decayed_grad) diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py index 2318e07a7..4ca7bce7b 100644 --- a/colossalai/utils/data_sampler/data_parallel_sampler.py +++ b/colossalai/utils/data_sampler/data_parallel_sampler.py @@ -4,15 +4,15 @@ import math import random -import numpy as np -from typing import TypeVar, Iterator +from typing import Iterator, TypeVar +import numpy as np import torch -from torch.utils.data import Sampler, Dataset, DataLoader +from torch.utils.data import DataLoader, Dataset, Sampler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import DATA_SAMPLERS +from colossalai.legacy.registry import DATA_SAMPLERS T_co = TypeVar('T_co', covariant=True) @@ -30,11 +30,7 @@ class DataParallelSampler(Sampler): the batch size, then the last batch will be smaller, defaults to False. """ - def __init__(self, - dataset: Dataset, - shuffle: bool = False, - seed: int = 0, - drop_last: bool = False) -> None: + def __init__(self, dataset: Dataset, shuffle: bool = False, seed: int = 0, drop_last: bool = False) -> None: self.dataset = dataset self.num_replicas = gpc.get_world_size(ParallelMode.DATA) self.rank = gpc.get_local_rank(ParallelMode.DATA) @@ -54,8 +50,7 @@ class DataParallelSampler(Sampler): self.num_replicas # type: ignore[arg-type] ) else: - self.num_samples = math.ceil( - len(self.dataset) / self.num_replicas) # type: ignore[arg-type] + self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) # type: ignore[arg-type] self.total_size = self.num_samples * self.num_replicas self.shuffle = shuffle self.seed = seed @@ -72,7 +67,7 @@ class DataParallelSampler(Sampler): # set_epoch manually self.epoch += 1 else: - indices = list(range(len(self.dataset))) # type: ignore[arg-type] + indices = list(range(len(self.dataset))) # type: ignore[arg-type] if not self.drop_last: # add extra samples to make it evenly divisible @@ -80,8 +75,7 @@ class DataParallelSampler(Sampler): if padding_size <= len(indices): indices += indices[:padding_size] else: - indices += (indices * math.ceil(padding_size / - len(indices)))[:padding_size] + indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size] else: # remove tail of data to make it evenly divisible. 
indices = indices[:self.total_size] @@ -109,8 +103,8 @@ class DataParallelSampler(Sampler): def get_dataloader(dataset, shuffle=False, - seed=1024, - add_sampler=True, + seed=1024, + add_sampler=True, drop_last=False, pin_memory=False, num_workers=0, diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py index 8f8fec649..d68a9dc64 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py @@ -1,6 +1,6 @@ import torch -from colossalai.registry import OPHOOKS +from colossalai.legacy.registry import OPHOOKS from . import BaseOpHook diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py index a2a62fb97..6b76a2116 100644 --- a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py +++ b/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py @@ -1,6 +1,6 @@ import torch -from colossalai.registry import OPHOOKS +from colossalai.legacy.registry import OPHOOKS from . import BaseOpHook diff --git a/colossalai/zero/legacy/sharded_model/zero_hook.py b/colossalai/zero/legacy/sharded_model/zero_hook.py index 50f4bdfc7..1815bee3a 100644 --- a/colossalai/zero/legacy/sharded_model/zero_hook.py +++ b/colossalai/zero/legacy/sharded_model/zero_hook.py @@ -3,8 +3,8 @@ from typing import Optional import torch import torch.distributed as dist +from colossalai.legacy.registry import OPHOOKS from colossalai.logging import get_dist_logger -from colossalai.registry import OPHOOKS from colossalai.utils import get_current_device from colossalai.zero.gemini.memory_tracer import MemStatsCollector from colossalai.zero.legacy.gemini.ophooks import BaseOpHook diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md index cda49af47..384221596 100644 --- a/docs/source/en/advanced_tutorials/add_your_parallel.md +++ b/docs/source/en/advanced_tutorials/add_your_parallel.md @@ -98,7 +98,7 @@ parallel gradient handler is added to the engine automatically if data parallel gradient handler like below: ```python -from colossalai.registry import GRADIENT_HANDLER +from colossalai.legacy.registry import GRADIENT_HANDLER from colossalai.legacy.engine import BaseGradientHandler @GRADIENT_HANDLER.register_module diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 98c16e922..5aa806c64 100644 --- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -36,7 +36,7 @@ import torch import torch.nn as nn from colossalai import nn as col_nn from colossalai.amp import AMP_TYPE -from colossalai.builder.pipeline import partition_uniform +from colossalai.legacy.builder.pipeline import partition_uniform from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md index 370931d87..6dbe33800 100644 --- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md +++ 
b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md @@ -34,7 +34,7 @@ import colossalai import colossalai.nn as col_nn import torch import torch.nn as nn -from colossalai.builder import build_pipeline_model +from colossalai.legacy.builder import build_pipeline_model from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger @@ -51,17 +51,17 @@ from torchvision.datasets import CIFAR10 Generally, we provide 3 ways to build a pipelined model: -1. `colossalai.builder.build_pipeline_model_from_cfg` -2. `colossalai.builder.build_pipeline_model` +1. `colossalai.legacy.builder.build_pipeline_model_from_cfg` +2. `colossalai.legacy.builder.build_pipeline_model` 3. Split the model by stages by yourself When your memory can fit the model, you can use the first two methods to build your model, otherwise you must split the model by yourself. The first two methods first build the whole model on CPU, then split the model, and finally you can just move the corresponding part of model to GPU. -`colossalai.builder.build_pipeline_model_from_cfg()` receives a config file of model, and it can split the model uniformly (by layer) or balanced (by parameter size). +`colossalai.legacy.builder.build_pipeline_model_from_cfg()` receives a config file of model, and it can split the model uniformly (by layer) or balanced (by parameter size). -If you are familiar with `PyTorch`, you can use `colossalai.builder.build_pipeline_model()` which receives a `torch.nn.Sequential` model and split it by layer uniformly. +If you are familiar with `PyTorch`, you can use `colossalai.legacy.builder.build_pipeline_model()` which receives a `torch.nn.Sequential` model and split it by layer uniformly. -In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.builder.build_pipeline_model()` to build the pipelined model. +In this tutorial, we will modify [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential` and then use `colossalai.legacy.builder.build_pipeline_model()` to build the pipelined model. When the data is **one** `Tensor`, you can use the positional argument in `forward()` of your model to get the data tensor. For the first stage of pipeline, the first positional argument of `forward()` is the data tensor loaded from data loader. For other stages, the first positional argument of `forward()` is the output tensor from the previous stage. Note that if the stage is not the last stage, the return of `forward()` must be a `Tensor`. diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index fc1101c5a..22022639c 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token ### Build pipeline model (`/hybrid_parallel/model/vit.py`) Colossal-AI provides two methods to build a pipeline model from the existing model. 
diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index fc1101c5a..22022639c 100644
--- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -273,8 +273,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
 ### Build pipeline model (`/hybrid_parallel/model/vit.py`)
 Colossal-AI provides two methods to build a pipeline model from the existing model.
-- `colossalai.builder.build_pipeline_model_from_cfg`
-- `colossalai.builder.build_pipeline_model`
+- `colossalai.legacy.builder.build_pipeline_model_from_cfg`
+- `colossalai.legacy.builder.build_pipeline_model`
 Besides, you can also build a pipeline model from scratch with Colossal-AI.
 ```python
@@ -284,11 +284,11 @@ from typing import Callable
 import inspect
 import torch
 from colossalai import nn as col_nn
-from colossalai.registry import LAYERS, MODELS
+from colossalai.legacy.registry import LAYERS, MODELS
 from colossalai.logging import get_dist_logger
 from colossalai.core import global_context as gpc
 from colossalai.context import ParallelMode
-from colossalai.builder.pipeline import partition_uniform
+from colossalai.legacy.builder.pipeline import partition_uniform
 from torch import dtype, nn
 from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
diff --git a/docs/source/en/features/gradient_handler.md b/docs/source/en/features/gradient_handler.md
index 14ced32b8..66e5e3a9d 100644
--- a/docs/source/en/features/gradient_handler.md
+++ b/docs/source/en/features/gradient_handler.md
@@ -28,7 +28,7 @@ To implement a customized gradient handler, you need to follow these steps.
 3. implement `handle_gradient` method.
 ```python
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
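The gradient-handler document above changes only the registry import. For reference, a complete handler under the new path might look like the sketch below; it assumes `BaseGradientHandler` exposes the wrapped model as `self._model`, and `MyAllReduceGradientHandler` is a hypothetical name used only for illustration.

```python
import torch.distributed as dist

from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
from colossalai.legacy.registry import GRADIENT_HANDLER


@GRADIENT_HANDLER.register_module
class MyAllReduceGradientHandler(BaseGradientHandler):
    """Hypothetical handler: all-reduce every gradient over the default process group."""

    def handle_gradient(self):
        # Synchronize gradients across ranks after the backward pass.
        for param in self._model.parameters():
            if param.grad is not None:
                dist.all_reduce(param.grad)
```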
diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
index abfe058c6..c4b0f6557 100644
--- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
+++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
@@ -87,7 +87,7 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管理
 你可以添加你自己的梯度 handler,如下所示:
 ```python
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from colossalai.legacy.engine import BaseGradientHandler
 @GRADIENT_HANDLER.register_module
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
index 84b48165b..9cfbf5873 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -36,7 +36,7 @@ import torch
 import torch.nn as nn
 from colossalai import nn as col_nn
 from colossalai.amp import AMP_TYPE
-from colossalai.builder.pipeline import partition_uniform
+from colossalai.legacy.builder.pipeline import partition_uniform
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
index 1ac01c207..5ef863dcd 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
@@ -32,7 +32,7 @@ import colossalai
 import colossalai.nn as col_nn
 import torch
 import torch.nn as nn
-from colossalai.builder import build_pipeline_model
+from colossalai.legacy.builder import build_pipeline_model
 from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
 PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
@@ -48,17 +48,17 @@ from torchvision.datasets import CIFAR10
 总的来说, 我们提供3种方法来建立一个流水并行的模型:
-1. `colossalai.builder.build_pipeline_model_from_cfg`
-2. `colossalai.builder.build_pipeline_model`
+1. `colossalai.legacy.builder.build_pipeline_model_from_cfg`
+2. `colossalai.legacy.builder.build_pipeline_model`
 3. 自己按阶段拆分模型
 当你的内存能够容纳模型时,你可以使用前两种方法来建立你的模型,否则你必须自己分割模型。前两种方法首先在 CPU 上建立整个模型,然后分割模型,最后你可以直接把模型的相应部分移到 GPU 上。
-`colossalai.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。
+`colossalai.legacy.builder.build_pipeline_model_from_cfg()` 接收一个模型的配置文件,它可以均匀地(按层)或平衡地(按参数大小)分割模型。
-如果你熟悉 `PyTorch`, 你可以使用 `colossalai.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
+如果你熟悉 `PyTorch`, 你可以使用 `colossalai.legacy.builder.build_pipeline_model()` 它接收一个 `torch.nn.Sequential` 模型并按层均匀分割。
-在本教程中,我们将修改 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential`,然后使用 `colossalai.builder.build_pipeline_model()` 来建立流水线模型。
+在本教程中,我们将修改 [TIMM/ViT](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to `torch.nn.Sequential`,然后使用 `colossalai.legacy.builder.build_pipeline_model()` 来建立流水线模型。
 当数据是 **一个** `Tensor`, 你可以使用你的模型 `forward()` 中的位置参数来获得数据张量。对于流水线的第一阶段,`forward()` 的第一个位置参数是从数据加载器加载的数据张量。对于其他阶段,`forward()` 的第一个位置参数是上一阶段的输出张量。注意,如果该阶段不是最后一个阶段,则 `forward()` 的返回必须是一个 `Tensor`。
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index 650bab105..803882a5a 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -256,8 +256,8 @@ SEQ_LENGTH = (IMG_SIZE // PATCH_SIZE) ** 2 + 1 # add 1 for cls token
 ### 构建流水线模型 (`/hybrid_parallel/model/vit.py`)
 Colossal-AI 提供了两种从现有模型构建流水线模型的方法。
-- `colossalai.builder.build_pipeline_model_from_cfg`
-- `colossalai.builder.build_pipeline_model`
+- `colossalai.legacy.builder.build_pipeline_model_from_cfg`
+- `colossalai.legacy.builder.build_pipeline_model`
 此外,您还可以使用 Colossal-AI 从头开始构建流水线模型。
 ```python
@@ -266,11 +266,11 @@ from typing import Callable
 import inspect
 import torch
 from colossalai import nn as col_nn
-from colossalai.registry import LAYERS, MODELS
+from colossalai.legacy.registry import LAYERS, MODELS
 from colossalai.logging import get_dist_logger
 from colossalai.core import global_context as gpc
 from colossalai.context import ParallelMode
-from colossalai.builder.pipeline import partition_uniform
+from colossalai.legacy.builder.pipeline import partition_uniform
 from torch import dtype, nn
 from model_zoo.vit.vit import ViTBlock, ViTEmbedding, ViTHead
 @MODELS.register_module
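The hybrid-parallel ViT docs register their pipeline model with `MODELS` (and its layers with `LAYERS`), so user code that does the same only needs the new import path. A minimal, hypothetical registration sketch, assuming config-driven construction looks the model up by class name:

```python
import torch.nn as nn

from colossalai.legacy.registry import MODELS


@MODELS.register_module
class TinyMLP(nn.Module):
    """Hypothetical model registered so it can be built from a config by name."""

    def __init__(self, dim: int = 256, hidden_dim: int = 1024):
        super().__init__()
        self.proj_in = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.proj_out = nn.Linear(hidden_dim, dim)

    def forward(self, x):
        return self.proj_out(self.act(self.proj_in(x)))
```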
diff --git a/docs/source/zh-Hans/features/gradient_handler.md b/docs/source/zh-Hans/features/gradient_handler.md
index b08dd6806..3b1140409 100644
--- a/docs/source/zh-Hans/features/gradient_handler.md
+++ b/docs/source/zh-Hans/features/gradient_handler.md
@@ -25,7 +25,7 @@ 3. 实现 `handle_gradient`
 ```python
-from colossalai.registry import GRADIENT_HANDLER
+from colossalai.legacy.registry import GRADIENT_HANDLER
 from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
diff --git a/examples/language/gpt/titans/dataset/webtext.py b/examples/language/gpt/titans/dataset/webtext.py
index 64f5944a9..fdfc57e9b 100644
--- a/examples/language/gpt/titans/dataset/webtext.py
+++ b/examples/language/gpt/titans/dataset/webtext.py
@@ -6,7 +6,7 @@ import torch
 from torch.utils.data import Dataset
 from transformers import GPT2Tokenizer
-from colossalai.registry import DATASETS
+from colossalai.legacy.registry import DATASETS
 @DATASETS.register_module
diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py
index d825ae92a..668992901 100644
--- a/examples/language/gpt/titans/model/embed.py
+++ b/examples/language/gpt/titans/model/embed.py
@@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
+from colossalai.legacy.registry import LAYERS, LOSSES, MODELS
 from colossalai.nn.layer.base_layer import ParallelLayer
 from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input
 from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row
 from colossalai.nn.layer.utils import divide
-from colossalai.registry import LAYERS, LOSSES, MODELS
 from colossalai.utils import get_current_device