diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index a3df2c50e..f1e7a2d0c 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -89,7 +89,7 @@ jobs: - name: Install ColossalAI run: | source activate pytorch - pip install -v . + CUDA_EXT=1 pip install -v . - name: Test the Doc run: | diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 6b4f5d1f9..027fbfd0a 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -32,7 +32,7 @@ jobs: - name: Install ColossalAI run: | - pip install -v . + CUDA_EXT=1 pip install -v . - name: Install Doc Test Requirements run: | diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 620d4771a..9d3bd9a48 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -53,7 +53,7 @@ jobs: uses: actions/checkout@v3 - name: Install Colossal-AI run: | - pip install -v . + CUDA_EXT=1 pip install -v . - name: Test the example run: | dir=${{ matrix.directory }} diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index ec23b9d1c..5934704f4 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -88,7 +88,7 @@ jobs: - name: Install Colossal-AI run: | - pip install -v . + CUDA_EXT=1 pip install -v . - name: Test the example run: | diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index bd52ca432..5ed128c3e 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -42,7 +42,7 @@ jobs: - name: Install Colossal-AI run: | - pip install -v . + CUDA_EXT=1 pip install -v . 
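The workflow changes above set `CUDA_EXT=1` in front of `pip install -v .` so that the CUDA kernels are compiled at install time instead of being JIT-built on first use. A minimal sketch of how such an environment flag is typically consumed in a `setup.py`; the extension and source names below are illustrative placeholders, not ColossalAI's actual build script:

```python
# Sketch only: gate optional CUDA extension builds on an env flag such as CUDA_EXT.
# Extension/source names are placeholders, not the project's real build configuration.
import os
from setuptools import setup

build_cuda_ext = os.environ.get('CUDA_EXT', '0') == '1'

ext_modules = []
cmdclass = {}
if build_cuda_ext:
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension
    ext_modules.append(CUDAExtension(name='colossalai._C.fused_optim',
                                     sources=['csrc/fused_optim.cpp']))
    cmdclass['build_ext'] = BuildExtension

setup(name='colossalai', ext_modules=ext_modules, cmdclass=cmdclass)
```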
- name: Traverse all files run: | diff --git a/colossalai/__init__.py b/colossalai/__init__.py index f859161f7..fa6f72a60 100644 --- a/colossalai/__init__.py +++ b/colossalai/__init__.py @@ -1,11 +1,4 @@ -from .initialize import ( - get_default_parser, - initialize, - launch, - launch_from_openmpi, - launch_from_slurm, - launch_from_torch, -) +from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch try: # .version will be created by setup.py @@ -15,3 +8,5 @@ except ModuleNotFoundError: # and directly set PYTHONPATH to use Colossal-AI which is a bad practice __version__ = '0.0.0' print('please install Colossal-AI from https://www.colossalai.org/download or from source') + +__all__ = ['launch', 'launch_from_openmpi', 'launch_from_slurm', 'launch_from_torch', '__version__'] diff --git a/colossalai/amp/__init__.py b/colossalai/amp/__init__.py index 963215476..e69de29bb 100644 --- a/colossalai/amp/__init__.py +++ b/colossalai/amp/__init__.py @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch.nn as nn -from torch.nn.modules.loss import _Loss -from torch.optim import Optimizer - -from colossalai.context import Config - -from .amp_type import AMP_TYPE -from .apex_amp import convert_to_apex_amp -from .naive_amp import convert_to_naive_amp -from .torch_amp import convert_to_torch_amp - -__all__ = ['convert_to_amp', 'convert_to_naive_amp', 'convert_to_apex_amp', 'convert_to_torch_amp', 'AMP_TYPE'] - - -def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None): - """A helper function to wrap training components with Torch AMP modules. - - Args: - param model (:class:`torch.nn.Module`): your model object. - optimizer (:class:`torch.optim.Optimizer`): your optimizer object. - criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object. - mode (:class:`colossalai.amp.AMP_TYPE`): amp mode. - amp_config (Union[:class:`colossalai.context.Config`, dict]): configuration for different amp modes. - - Returns: - A tuple (model, optimizer, criterion). - - Note: - ``amp_config`` may vary from different mode you choose. You should check the corresponding amp mode - for more details about ``amp_config``. - For ``apex_amp``, please check - `apex_amp config `_. - For ``naive_amp``, please check - `naive_amp config `_. - For ``torch_amp``, please check - `torch_amp config `_. 
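Earlier in this diff, `colossalai/__init__.py` drops `get_default_parser` and `initialize` from the top-level exports and keeps only the launch helpers plus `__version__`. A short sketch of the resulting import surface; the `colossalai.legacy` path is the one introduced by the `legacy/__init__.py` hunk further below:

```python
# The trimmed top-level API after this change: only the launch helpers are re-exported.
import colossalai

assert hasattr(colossalai, 'launch_from_torch')
assert hasattr(colossalai, 'launch')
print(colossalai.__version__)

# `initialize` is no longer re-exported from `colossalai`; per the legacy package
# added later in this diff, it is now imported from colossalai.legacy instead.
from colossalai.legacy import initialize  # noqa: F401
```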
- """ - assert isinstance(mode, AMP_TYPE), \ - f'expected the argument mode be AMP_TYPE, but got {type(mode)}' - - if amp_config is None: - amp_config = Config() - - if mode == AMP_TYPE.TORCH: - model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config) - elif mode == AMP_TYPE.APEX: - model, optimizer = convert_to_apex_amp(model, optimizer, amp_config) - elif mode == AMP_TYPE.NAIVE: - model, optimizer = convert_to_naive_amp(model, optimizer, amp_config) - - return model, optimizer, criterion diff --git a/colossalai/amp/naive_amp/__init__.py b/colossalai/amp/naive_amp/__init__.py index 5b2f71d3c..e69de29bb 100644 --- a/colossalai/amp/naive_amp/__init__.py +++ b/colossalai/amp/naive_amp/__init__.py @@ -1,60 +0,0 @@ -import inspect - -import torch.nn as nn -from torch.optim import Optimizer - -from colossalai.utils import is_no_pp_or_last_stage - -from ._fp16_optimizer import FP16Optimizer -from .grad_scaler import ConstantGradScaler, DynamicGradScaler -from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer - - -def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config): - """A helper function to wrap training components with naive AMP modules. In this mode, - we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss, - which is equivalent to Apex O3. - - Args: - model (:class:`torch.nn.Module`): your model object - optimizer (:class:`torch.optim.Optimizer`): your optimizer object - amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp. - - Returns: - Tuple: A tuple (model, optimizer) - - The ``amp_config`` should contain parameters below:: - - verbose (bool, optional): if set to `True`, will print debug info (Default: False). - clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0). - Note that clipping is ignored if clip_grad == 0. - dynamic_grad_scale (bool): whether to use dynamic grad scaler. 
- """ - if isinstance(model, nn.ModuleList): - # interleaved pipeline - module_list = [] - for chunk, m in enumerate(model): - output_to_fp32 = is_no_pp_or_last_stage() and chunk == len(model) - 1 - module_list.append(NaiveAMPModel(m, output_to_fp32=output_to_fp32)) - model = nn.ModuleList(module_list) - else: - output_to_fp32 = is_no_pp_or_last_stage() - model = NaiveAMPModel(model, output_to_fp32=output_to_fp32) - - use_dynamic_grad_scaler = amp_config.pop('dynamic_grad_scale', True) - if use_dynamic_grad_scaler: - scaler_class = DynamicGradScaler - else: - scaler_class = ConstantGradScaler - - sig = inspect.signature(scaler_class.__init__) - kwargs = dict() - for param in sig.parameters.values(): - if param.name in amp_config: - kwargs[param.name] = amp_config.pop(param.name) - grad_scaler = scaler_class(**kwargs) - optimizer = NaiveAMPOptimizer(optimizer, grad_scaler, **amp_config) - return model, optimizer - - -__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer'] diff --git a/colossalai/auto_parallel/offload/amp_optimizer.py b/colossalai/auto_parallel/offload/amp_optimizer.py index 19d85b80d..353133bd6 100644 --- a/colossalai/auto_parallel/offload/amp_optimizer.py +++ b/colossalai/auto_parallel/offload/amp_optimizer.py @@ -5,8 +5,8 @@ import torch from torch.optim import Optimizer from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler +from colossalai.interface import OptimizerWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import get_current_device from .base_offload_module import BaseOffloadModule @@ -19,7 +19,7 @@ class OptimState(Enum): UNSCALED = 1 -class AMPOptimizer(ColossalaiOptimizer): +class AMPOptimizer(OptimizerWrapper): """ A wrapper for Optimizer. Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 3441eca38..664ac63e4 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -13,7 +13,6 @@ import torch.nn as nn from torch.optim import Optimizer from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.tensor.d_tensor import ( is_customized_distributed_tensor, is_distributed_tensor, @@ -130,10 +129,7 @@ def unwrap_optimizer(optimizer: OptimizerWrapper): This method should be used before saving/loading it to/from sharded checkpoints. 
''' - # TODO(Baizhou): ColossalaiOptimizer will be replaced with OptimizerWrapper in the future unwrapped_optim = optimizer.optim - if isinstance(unwrapped_optim, ColossalaiOptimizer): - unwrapped_optim = unwrapped_optim.optim return unwrapped_optim diff --git a/colossalai/cli/benchmark/__init__.py b/colossalai/cli/benchmark/__init__.py deleted file mode 100644 index 618ff8c61..000000000 --- a/colossalai/cli/benchmark/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -import click - -from colossalai.context import Config - -from .benchmark import run_benchmark -from .utils import * - -__all__ = ['benchmark'] - - -@click.command() -@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.") -@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.") -@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.") -@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.") -@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.") -@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.") -@click.option("-l", "--layers", type=int, default=2) -@click.option("-m", - "--model", - type=click.Choice(['mlp'], case_sensitive=False), - default='mlp', - help="Select the model to benchmark, currently only supports MLP") -def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int, - layers: int, model: str): - args_dict = locals() - args = Config(args_dict) - run_benchmark(args) diff --git a/colossalai/cli/benchmark/benchmark.py b/colossalai/cli/benchmark/benchmark.py deleted file mode 100644 index 97a9f4572..000000000 --- a/colossalai/cli/benchmark/benchmark.py +++ /dev/null @@ -1,105 +0,0 @@ -from functools import partial -from typing import Dict, List - -import click -import torch.multiprocessing as mp - -import colossalai -from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model -from colossalai.context import Config -from colossalai.context.random import reset_seeds -from colossalai.core import global_context as gpc -from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.testing import free_port -from colossalai.utils import MultiTimer - -from .models import MLP - - -def run_benchmark(args: Config) -> None: - """ - Run benchmarking with torch.multiprocessing. - """ - - # sanity checks - if args.gpus is None: - click.echo("Error: --num_gpus is not given") - exit() - if args.gpus <= 1: - click.echo("Warning: tensor parallel will be activated with at least 2 devices.") - - click.echo("=== Benchmarking Parameters ===") - for k, v in args.items(): - click.echo(f'{k}: {v}') - click.echo('') - - config_list = find_all_configs(args.gpus) - - avail_ports = [free_port() for _ in range(len(config_list))] - run_func = partial(run_dist_profiling, - world_size=args.gpus, - port_list=avail_ports, - config_list=config_list, - hyperparams=args) - mp.spawn(run_func, nprocs=args.gpus) - - -def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict], - hyperparams: Config) -> None: - """ - A function executed for profiling, this function should be spawn by torch.multiprocessing. 
- - Args: - rank (int): rank of the process - world_size (int): the number of processes - port_list (List[int]): a list of free ports for initializing distributed networks - config_list (List[Dict]): a list of configuration - hyperparams (Config): the hyperparameters given by the user - - """ - - # disable logging for clean output - disable_existing_loggers() - logger = get_dist_logger() - logger.set_level('WARNING') - - for config, port in zip(config_list, port_list): - colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') - timer = MultiTimer() - - # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size. - if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0: - click.echo( - "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size." - ) - continue - - if hyperparams.model == 'mlp': - model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers) - else: - if gpc.get_global_rank() == 0: - click.echo("Error: Invalid argument for --model") - exit() - - data_func = partial(get_batch_data, - dim=hyperparams.dimension, - batch_size=hyperparams.batch_size, - seq_length=hyperparams.seq_len, - mode=config.parallel.tensor.mode) - - fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model, - warmup_steps=hyperparams.warmup_steps, - profile_steps=hyperparams.profile_steps, - data_func=data_func, - timer=timer) - - gpc.destroy() - reset_seeds() - - if gpc.get_global_rank() == 0: - config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()]) - click.echo(f"=== {config_str} ===") - click.echo(f"Average forward time: {fwd_time}") - click.echo(f"Average backward time: {bwd_time}") - click.echo(f"Max allocated GPU memory: {max_allocated}") - click.echo(f"Max cached GPU memory: {max_cached}\n") diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py deleted file mode 100644 index 385b485b6..000000000 --- a/colossalai/cli/benchmark/models.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch - -import colossalai.legacy.nn as col_nn - - -class MLP(torch.nn.Module): - - def __init__(self, dim: int, layers: int): - super().__init__() - self.layers = torch.nn.ModuleList() - - for _ in range(layers): - self.layers.append(col_nn.Linear(dim, dim)) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x diff --git a/colossalai/cli/benchmark/utils.py b/colossalai/cli/benchmark/utils.py deleted file mode 100644 index ee7d92d6e..000000000 --- a/colossalai/cli/benchmark/utils.py +++ /dev/null @@ -1,159 +0,0 @@ -import math -import time -from typing import Callable, Dict, List, Tuple - -import torch - -from colossalai.context import Config, ParallelMode -from colossalai.utils import MultiTimer - - -def get_time_stamp() -> int: - """ - Return the time stamp for profiling. - - Returns: - time_stamp (int): the time given by time.time() - """ - - torch.cuda.synchronize() - time_stamp = time.time() - return time_stamp - - -def get_memory_states() -> Tuple[float]: - """ - Return the memory statistics. 
- - Returns: - max_allocated (float): the allocated CUDA memory - max_cached (float): the cached CUDA memory - """ - - max_allocated = torch.cuda.max_memory_allocated() / (1024**3) - max_cached = torch.cuda.max_memory_reserved() / (1024**3) - torch.cuda.reset_peak_memory_stats() - torch.cuda.empty_cache() - return max_allocated, max_cached - - -def find_all_configs(device_cnt: int) -> List[Dict]: - """ - Find all possible configurations for tensor parallelism - - Args: - device_cnt (int): the number of devices - - Returns: - config_list (List[Dict]): a list of configurations - """ - - def _is_square(num): - # 2D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(math.sqrt(num))**2 == num - - def _is_cube(num): - # 3D parallel should be implemented with at least 2 devices. - if num <= 1: - return False - return math.floor(num**(1. / 3.))**3 == num - - config_list = [] - - # add non-parallel config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None))) - config_list.append(config) - - # add 1D config - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d'))) - config_list.append(config) - - # add 2D config only if device_cnt is a square - if _is_square(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d'))) - config_list.append(config) - - # check for 2.5D - # iterate over depth - for depth in range(1, device_cnt): - if device_cnt % depth == 0 and _is_square(device_cnt // depth): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth))) - config_list.append(config) - - # check for 3D if device_cnt is a cube - if _is_cube(device_cnt): - config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d'))) - config_list.append(config) - - config_list = [Config(cfg) for cfg in config_list] - return config_list - - -def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable, - timer: MultiTimer) -> Tuple[float]: - """ - Profile the forward and backward of a model - - Args: - model (torch.nn.Module): a PyTorch model - warmup_steps (int): the number of steps for warmup - profile_steps (int): the number of steps for profiling - data_func (Callable): a function to generate random data - timer (colossalai.utils.Multitimer): a timer instance for time recording - - Returns: - fwd_time (float): the average forward time taken by forward pass in second - bwd_time (float): the average backward time taken by forward pass in second - max_allocated (float): the maximum GPU memory allocated in GB - max_cached (float): the maximum GPU memory cached in GB - """ - - def _run_step(data): - timer.start('forward') - out = model(data) - timer.stop('forward', keep_in_history=True) - timer.start('backward') - out.mean().backward() - timer.stop('backward', keep_in_history=True) - - data_list = [data_func() for _ in range(warmup_steps)] - for data in data_list: - _run_step(data) - timer.reset('forward') - timer.reset('backward') - - for _ in range(profile_steps): - data = data_func() - _run_step(data) - - max_allocated, max_cached = get_memory_states() - fwd_time = timer.get_timer('forward').get_history_mean() - bwd_time = timer.get_timer('backward').get_history_mean() - return fwd_time, bwd_time, max_allocated, max_cached - - -def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor: - """ - Return a random data of shape (batch_size, seq_length, dim) for profiling. 
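The deleted benchmark utilities above relied only on the standard `torch.cuda` memory counters, so the same measurement remains available without the CLI; the sketch below mirrors the removed `get_memory_states` helper:

```python
# Mirrors the removed get_memory_states(): peak allocated/reserved CUDA memory in GB,
# then reset the counters so the next profiling window starts clean.
import torch

def get_memory_states():
    max_allocated = torch.cuda.max_memory_allocated() / (1024 ** 3)
    max_cached = torch.cuda.max_memory_reserved() / (1024 ** 3)
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    return max_allocated, max_cached
```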
- - Args: - dim (int): hidden size - batch_size (int): the number of data samples - seq_length (int): the number of tokens - mode (ParallelMode): Colossal-AI ParallelMode enum - - Returns: - data (torch.Tensor): random data - """ - - if mode in ['2d', '2.5d']: - batch_size = batch_size // 2 - dim = dim // 2 - elif mode == '3d': - batch_size = batch_size // 4 - dim = dim // 2 - - data = torch.rand(batch_size, seq_length, dim).cuda() - return data diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py index a94e1150e..0dea7c504 100644 --- a/colossalai/cli/cli.py +++ b/colossalai/cli/cli.py @@ -1,6 +1,5 @@ import click -from .benchmark import benchmark from .check import check from .launcher import run @@ -19,7 +18,6 @@ def cli(): cli.add_command(run) cli.add_command(check) -cli.add_command(benchmark) if __name__ == '__main__': cli() diff --git a/colossalai/context/__init__.py b/colossalai/context/__init__.py index 50178b5fa..eb6d5d05a 100644 --- a/colossalai/context/__init__.py +++ b/colossalai/context/__init__.py @@ -1,6 +1,8 @@ from .config import Config, ConfigException -from .parallel_context import ParallelContext -from .parallel_mode import ParallelMode -from .moe_context import MOE_CONTEXT -from .process_group_initializer import * -from .random import * + +# from .moe_context import MOE_CONTEXT + +__all__ = [ + 'Config', + 'ConfigException', +] diff --git a/colossalai/context/moe_context.py b/colossalai/context/moe_context.py index b41f4072a..b6e3b5201 100644 --- a/colossalai/context/moe_context.py +++ b/colossalai/context/moe_context.py @@ -3,13 +3,12 @@ from typing import Tuple import torch import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode from colossalai.context.singleton_meta import SingletonMeta -from colossalai.tensor import ProcessGroup +from colossalai.legacy.tensor import ProcessGroup def _check_sanity(): - from colossalai.core import global_context as gpc + from colossalai.legacy.core import global_context as gpc if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1: raise NotImplementedError("Moe is not compatible with tensor or " "pipeline parallel at present.") @@ -61,7 +60,7 @@ class MoeContext(metaclass=SingletonMeta): self.world_size = dist.get_world_size() - from colossalai.core import global_context as gpc + from colossalai.legacy.core import global_context as gpc self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) assert self.world_size % self.max_ep_size == 0, \ "Maximum expert parallel size must be a factor of the number of GPUs" diff --git a/colossalai/core.py b/colossalai/core.py deleted file mode 100644 index 153247bbe..000000000 --- a/colossalai/core.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from colossalai.context.parallel_context import global_context - -__all__ = ['global_context'] \ No newline at end of file diff --git a/colossalai/fx/passes/shard_1d_pass.py b/colossalai/fx/passes/shard_1d_pass.py index d2bad06bb..ccbab0c38 100644 --- a/colossalai/fx/passes/shard_1d_pass.py +++ b/colossalai/fx/passes/shard_1d_pass.py @@ -1,9 +1,11 @@ +import operator + import torch import torch.nn as nn -import operator -from colossalai.tensor import ProcessGroup -from colossalai.tensor.distspec import ShardSpec -from colossalai.tensor.compute_spec import ComputePattern, ComputeSpec + +from colossalai.legacy.tensor import ProcessGroup +from colossalai.legacy.tensor.compute_spec import ComputePattern, ComputeSpec +from colossalai.legacy.tensor.distspec import 
ShardSpec ELEMENTWISE_MODULE_OP = [torch.nn.Dropout, torch.nn.ReLU] ELEMENTWISE_FUNC_OP = [ @@ -13,7 +15,7 @@ ELEMENTWISE_FUNC_OP = [ def weight_split(weight: torch.nn.parameter.Parameter, dim: int, col_normal: bool) -> torch.nn.parameter.Parameter: - """weight_split + """weight_split split a nn.Parameter Args: @@ -60,9 +62,9 @@ def row_shard_linear_pass(gm: torch.fx.GraphModule): def transformer_mlp_pass(graph_module: torch.fx.GraphModule, process_group: ProcessGroup): """ - This IR pass checks for transformer MLP like structure and annotate column and row sharding to the linear layers. + This IR pass checks for transformer MLP like structure and annotate column and row sharding to the linear layers. """ - #TODO: Needs to handle special cases, like x = linear(x) + linear(x) + # TODO: Needs to handle special cases, like x = linear(x) + linear(x) graph = graph_module.graph world_size = process_group.world_size() diff --git a/colossalai/initialize.py b/colossalai/initialize.py index a1694e059..b8718abc8 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -1,58 +1,17 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -import argparse import os -import pprint +import warnings from pathlib import Path -from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Dict, Union import torch -import torch.nn as nn -from torch.nn.modules.loss import _Loss -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.optim.lr_scheduler import _LRScheduler -from torch.optim.optimizer import Optimizer -from torch.utils.data import DataLoader +import torch.distributed as dist -from colossalai.amp import AMP_TYPE, convert_to_amp -from colossalai.amp.naive_amp import NaiveAMPModel -from colossalai.context import Config, ConfigException, ParallelMode -from colossalai.context.moe_context import MOE_CONTEXT -from colossalai.core import global_context as gpc -from colossalai.legacy.builder.builder import build_gradient_handler -from colossalai.legacy.engine import Engine -from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient -from colossalai.legacy.engine.schedule import ( - InterleavedPipelineSchedule, - NonPipelineSchedule, - PipelineSchedule, - get_tensor_shape, -) +from colossalai.context import Config from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer -from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param -from colossalai.utils.moe import sync_moe_model_param -from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2 -from colossalai.zero.legacy.gemini.ophooks import BaseOpHook - - -def get_default_parser(): - """Reads user command line and uses an argument parser to parse the input arguments. - Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. - - Returns: - Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser. 
- """ - parser = argparse.ArgumentParser() - parser.add_argument('--config', type=str, help='path to the config file') - parser.add_argument('--host', type=str, help='the master address for distributed training') - parser.add_argument('--port', type=int, help='the master port for distributed training') - parser.add_argument('--world_size', type=int, help='world size for distributed training') - parser.add_argument('--rank', type=int, help='rank for the default process group') - parser.add_argument('--local_rank', type=int, help='local rank on the node') - parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication') - return parser +from colossalai.utils import set_device, set_seed def launch(config: Union[str, Path, Config, Dict], @@ -83,40 +42,23 @@ def launch(config: Union[str, Path, Config, Dict], Raises: Exception: Raise exception when config type is wrong """ - gpc.verbose = verbose - - # set config - assert isinstance(config, (Config, str, Path, dict)), \ - f'expected argument config to be Config, str or Path, but got {type(config)}' - if not isinstance(config, Config) and isinstance(config, dict): - config = Config(config) - if isinstance(config, (str, Path)): - config = Config.from_file(config) - gpc.load_config(config) + if rank == 0: + warnings.warn("`config` is deprecated and will be removed soon.") # init default process group - gpc.init_global_dist(rank, world_size, backend, host, port) - - # init process groups for different parallel modes from config - gpc.init_parallel_groups() + init_method = f'tcp://[{host}]:{port}' + dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method) # set cuda device if torch.cuda.is_available(): # if local rank is not given, calculate automatically - gpc.set_device(local_rank) - - # set the number of processes running on the same node - gpc.detect_num_processes_on_current_node() + set_device(local_rank) - gpc.set_seed(seed) + set_seed(seed) if verbose: logger = get_dist_logger() - logger.info( - f'Distributed environment is initialized, ' - f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, ' - f'tensor parallel size: {gpc.tensor_parallel_size}', - ranks=[0]) + logger.info(f'Distributed environment is initialized, world size: {dist.get_world_size()}', ranks=[0]) def launch_from_slurm(config: Union[str, Path, Config, Dict], @@ -224,247 +166,3 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend=backend, seed=seed, verbose=verbose) - - -def initialize(model: nn.Module, - optimizer: Optimizer, - criterion: Optional[_Loss] = None, - train_dataloader: Optional[Iterable] = None, - test_dataloader: Optional[Iterable] = None, - lr_scheduler: Optional[_LRScheduler] = None, - ophooks: Optional[List[BaseOpHook]] = None, - verbose: bool = True) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]: - """Core function to wrap the essential training components with our functionality based on the config which is - loaded into gpc.config. - - Args: - model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model. - optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`): - Your optimizer instance. - criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance. - train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training. 
- test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing. - lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional. - verbose (bool, optional): Whether to print logs. - - Returns: - Tuple (engine, train_dataloader, test_dataloader, lr_scheduler): - A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)`` - where only ``engine`` could not be None. - """ - # get logger - logger = get_dist_logger() - gpc.verbose = verbose - - # get config from gpc - config = gpc.config - - # print config - if verbose: - logger.info( - f"\n========== Your Config ========\n" - f"{pprint.pformat(gpc.config)}\n" - f"================================\n", - ranks=[0]) - - # cudnn - cudnn_benchmark = config.get('cudnn_benchmark', False) - cudnn_deterministic = config.get('cudnn_deterministic', False) - torch.backends.cudnn.benchmark = cudnn_benchmark - torch.backends.cudnn.deterministic = cudnn_deterministic - if verbose: - logger.info(f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0]) - - # zero - use_zero = hasattr(gpc.config, 'zero') - if use_zero: - zero_cfg = gpc.config.get('zero', None) - if zero_cfg is not None: - cfg_ = zero_cfg.copy() - else: - cfg_ = {} - optimizer_config = zero_cfg.get('optimizer_config', None) - model_config = zero_cfg.get('model_config', None) - model, optimizer = convert_to_zero_v2(model, - optimizer, - model_config=model_config, - optimizer_config=optimizer_config) - - logger.info("Initializing ZeRO model and optimizer finished!", ranks=[0]) - else: - if isinstance(model, nn.Module): - # first sync model across dp ranks - model.to(get_current_device()) - elif isinstance(model, Callable): - model = model().to(get_current_device()) - - # optimizer maybe a optimizer_cls - if isinstance(optimizer, Callable): - optimizer = optimizer(model.parameters()) - logger.warning("Initializing an non ZeRO model with optimizer class") - - if not use_zero: - if is_using_sequence(): - sync_model_param(model, ParallelMode.SEQUENCE_DP) - elif MOE_CONTEXT.is_initialized: - sync_moe_model_param(model) - elif is_using_ddp(): - sync_model_param(model, ParallelMode.DATA) - else: - logger.warning( - "The parameters of models is not automatically synchronized.\n" - "Please make sure that all parameters are the same in data parallel group.", - ranks=[0]) - - # check amp and zero - fp16_cfg = gpc.config.get('fp16', None) - - if fp16_cfg is not None and fp16_cfg.mode is not None and use_zero: - raise ConfigException( - "It is not allowed to set fp16 and zero configuration in your config file at the same time") - - # clip grad norm - clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0) - - # initialize amp - amp_mode = None - if fp16_cfg is not None and fp16_cfg.mode is not None: - cfg_ = fp16_cfg.copy() - amp_mode = cfg_.pop('mode') - if is_using_pp(): - assert amp_mode == AMP_TYPE.NAIVE, 'Pipeline only support NaiveAMP currently' - if amp_mode == AMP_TYPE.NAIVE: - cfg_['clip_grad_norm'] = clip_grad_norm - model, optimizer, criterion = convert_to_amp(model=model, - optimizer=optimizer, - criterion=criterion, - mode=amp_mode, - amp_config=cfg_) - - # get torch ddp config - torch_ddp_cfg = gpc.config.get('torch_ddp', dict()) - - # gradient handler - gradient_handler_cfg = gpc.config.get('gradient_handler', None) - if gradient_handler_cfg is None: - # if gradient handler is not specified in the configuration file, - # check in the following order - # 1. 
if optimizer is ZERO, then use zero grad handler - # 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp - # 3. if using pipeline and dp size larger than 1, use data parallel grad handler - if isinstance(optimizer, ShardedOptimizerV2): - gradient_handler_cfg = [dict(type='ZeROGradientHandler')] - if verbose: - logger.info( - "Training with zero is detected, ZeROGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - elif is_using_ddp() and MOE_CONTEXT.is_initialized: - gradient_handler_cfg = [dict(type='MoeGradientHandler')] - if verbose: - logger.info( - "Data parallel training is detected with moe parallel, MoeGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - elif is_using_sequence(): - model = DDP(model, - process_group=gpc.get_group(ParallelMode.SEQUENCE_DP), - device_ids=[torch.cuda.current_device()], - **torch_ddp_cfg) - if verbose: - logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism', - ranks=[0]) - elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE: - model = DDP(model, - process_group=gpc.get_group(ParallelMode.DATA), - device_ids=[torch.cuda.current_device()], - **torch_ddp_cfg) - if verbose: - logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Data Parallelism', ranks=[0]) - elif is_using_ddp(): - gradient_handler_cfg = [dict(type='DataParallelGradientHandler')] - if verbose: - logger.info( - "Data parallel training is detected when using pipeline parallel, " - "DataParallelGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - # add pipeline parallel gradient handler, if pipeline shared module is detected - for param in model.parameters(): - if getattr(param, 'pipeline_shared_module_pg', None) is not None: - if gradient_handler_cfg is None: - gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')] - else: - gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler')) - if verbose: - logger.info( - "pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - break - else: - if not isinstance(gradient_handler_cfg, list): - raise ConfigException( - f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}" - ) - - # turn off sync buffer for NaiveAMPModel if using torch DDP and NaiveAMPModel at the same time - # to avoid duplicated buffer synchronization - if isinstance(model, DDP) and isinstance(model.module, NaiveAMPModel): - model.module.sync_buffer = False - - # initialize schedule for engine - if is_using_pp(): - tensor_shape = get_tensor_shape() - use_interleaved = hasattr(gpc.config, 'model') and hasattr(gpc.config.model, 'num_chunks') - if gpc.is_initialized(ParallelMode.PARALLEL_1D): - scatter_gather = True - else: - scatter_gather = False - if use_interleaved: - if isinstance(model, nn.Sequential): - model = nn.ModuleList([model]) - schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, - gpc.config.model.num_chunks, - tensor_shape=tensor_shape, - scatter_gather_tensors=scatter_gather) - else: - schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES, - tensor_shape=tensor_shape, - scatter_gather_tensors=scatter_gather) - else: - schedule = NonPipelineSchedule() - - if gradient_handler_cfg is None: - 
gradient_handlers = None - if verbose and not isinstance(model, DDP): - logger.warning( - "No PyTorch DDP or gradient handler is set up, please make sure you do not need " - "to all-reduce the gradients after a training step.", - ranks=[0]) - else: - gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] - - # check if optimizer is ColossalaiOptimizer - if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizerV2)): - optimizer = ColossalaiOptimizer(optim=optimizer) - - # gradient accumulation - grad_accum_size = gpc.config.get('gradient_accumulation', None) - if grad_accum_size is not None: - optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient( - model=model, - optimizer=optimizer, - dataloader=train_dataloader, - accumulate_size=grad_accum_size, - gradient_handlers=gradient_handlers, - lr_scheduler=lr_scheduler) - engine = Engine(model=model, - optimizer=optimizer, - criterion=criterion, - gradient_handlers=gradient_handlers, - clip_grad_norm=clip_grad_norm, - ophook_list=ophooks, - schedule=schedule) - - return engine, train_dataloader, test_dataloader, lr_scheduler diff --git a/colossalai/legacy/__init__.py b/colossalai/legacy/__init__.py index e69de29bb..f51941ee8 100644 --- a/colossalai/legacy/__init__.py +++ b/colossalai/legacy/__init__.py @@ -0,0 +1,9 @@ +from .initialize import initialize, launch, launch_from_openmpi, launch_from_slurm, launch_from_torch + +__all__ = [ + 'launch', + 'launch_from_openmpi', + 'launch_from_slurm', + 'launch_from_torch', + 'initialize', +] diff --git a/colossalai/legacy/amp/__init__.py b/colossalai/legacy/amp/__init__.py new file mode 100644 index 000000000..e83a7f6ac --- /dev/null +++ b/colossalai/legacy/amp/__init__.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch.nn as nn +from torch.nn.modules.loss import _Loss +from torch.optim import Optimizer + +from colossalai.context import Config + +from .amp_type import AMP_TYPE +from .apex_amp import convert_to_apex_amp +from .naive_amp import convert_to_naive_amp +from .torch_amp import convert_to_torch_amp + +__all__ = ['convert_to_amp', 'convert_to_naive_amp', 'convert_to_apex_amp', 'convert_to_torch_amp', 'AMP_TYPE'] + + +def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None): + """A helper function to wrap training components with Torch AMP modules. + + Args: + param model (:class:`torch.nn.Module`): your model object. + optimizer (:class:`torch.optim.Optimizer`): your optimizer object. + criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object. + mode (:class:`colossalai.legacy.amp.AMP_TYPE`): amp mode. + amp_config (Union[:class:`colossalai.context.Config`, dict]): configuration for different amp modes. + + Returns: + A tuple (model, optimizer, criterion). + + Note: + ``amp_config`` may vary from different mode you choose. You should check the corresponding amp mode + for more details about ``amp_config``. + For ``apex_amp``, please check + `apex_amp config `_. + For ``naive_amp``, please check + `naive_amp config `_. + For ``torch_amp``, please check + `torch_amp config `_. 
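Since `convert_to_amp` now lives in this new `colossalai.legacy.amp` package, existing callers only need their import path updated; a minimal sketch with a plain model, optimizer and criterion:

```python
# Minimal sketch of the relocated helper; only the import path changes for callers.
import torch
from colossalai.legacy.amp import AMP_TYPE, convert_to_amp

model = torch.nn.Linear(16, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.MSELoss()

model, optimizer, criterion = convert_to_amp(model, optimizer, criterion, mode=AMP_TYPE.TORCH)
```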
+ """ + assert isinstance(mode, AMP_TYPE), \ + f'expected the argument mode be AMP_TYPE, but got {type(mode)}' + + if amp_config is None: + amp_config = Config() + + if mode == AMP_TYPE.TORCH: + model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config) + elif mode == AMP_TYPE.APEX: + model, optimizer = convert_to_apex_amp(model, optimizer, amp_config) + elif mode == AMP_TYPE.NAIVE: + model, optimizer = convert_to_naive_amp(model, optimizer, amp_config) + + return model, optimizer, criterion diff --git a/colossalai/amp/amp_type.py b/colossalai/legacy/amp/amp_type.py similarity index 100% rename from colossalai/amp/amp_type.py rename to colossalai/legacy/amp/amp_type.py diff --git a/colossalai/amp/apex_amp/__init__.py b/colossalai/legacy/amp/apex_amp/__init__.py similarity index 100% rename from colossalai/amp/apex_amp/__init__.py rename to colossalai/legacy/amp/apex_amp/__init__.py diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/legacy/amp/apex_amp/apex_amp.py similarity index 86% rename from colossalai/amp/apex_amp/apex_amp.py rename to colossalai/legacy/amp/apex_amp/apex_amp.py index e6bdbe452..acc051181 100644 --- a/colossalai/amp/apex_amp/apex_amp.py +++ b/colossalai/legacy/amp/apex_amp/apex_amp.py @@ -10,11 +10,11 @@ except ImportError: from torch import Tensor -from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.utils import clip_grad_norm_fp32 +from colossalai.interface import OptimizerWrapper +from colossalai.legacy.utils import clip_grad_norm_fp32 -class ApexAMPOptimizer(ColossalaiOptimizer): +class ApexAMPOptimizer(OptimizerWrapper): """ A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm methods """ diff --git a/colossalai/legacy/amp/naive_amp/__init__.py b/colossalai/legacy/amp/naive_amp/__init__.py new file mode 100644 index 000000000..2ee84fc76 --- /dev/null +++ b/colossalai/legacy/amp/naive_amp/__init__.py @@ -0,0 +1,60 @@ +import inspect + +import torch.nn as nn +from torch.optim import Optimizer + +from colossalai.amp.naive_amp.grad_scaler import ConstantGradScaler, DynamicGradScaler +from colossalai.legacy.utils import is_no_pp_or_last_stage + +from ._fp16_optimizer import FP16Optimizer +from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer + + +def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config): + """A helper function to wrap training components with naive AMP modules. In this mode, + we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss, + which is equivalent to Apex O3. + + Args: + model (:class:`torch.nn.Module`): your model object + optimizer (:class:`torch.optim.Optimizer`): your optimizer object + amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp. + + Returns: + Tuple: A tuple (model, optimizer) + + The ``amp_config`` should contain parameters below:: + + verbose (bool, optional): if set to `True`, will print debug info (Default: False). + clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0). + Note that clipping is ignored if clip_grad == 0. + dynamic_grad_scale (bool): whether to use dynamic grad scaler. 
+ """ + if isinstance(model, nn.ModuleList): + # interleaved pipeline + module_list = [] + for chunk, m in enumerate(model): + output_to_fp32 = is_no_pp_or_last_stage() and chunk == len(model) - 1 + module_list.append(NaiveAMPModel(m, output_to_fp32=output_to_fp32)) + model = nn.ModuleList(module_list) + else: + output_to_fp32 = is_no_pp_or_last_stage() + model = NaiveAMPModel(model, output_to_fp32=output_to_fp32) + + use_dynamic_grad_scaler = amp_config.pop('dynamic_grad_scale', True) + if use_dynamic_grad_scaler: + scaler_class = DynamicGradScaler + else: + scaler_class = ConstantGradScaler + + sig = inspect.signature(scaler_class.__init__) + kwargs = dict() + for param in sig.parameters.values(): + if param.name in amp_config: + kwargs[param.name] = amp_config.pop(param.name) + grad_scaler = scaler_class(**kwargs) + optimizer = NaiveAMPOptimizer(optimizer, grad_scaler, **amp_config) + return model, optimizer + + +__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer'] diff --git a/colossalai/amp/naive_amp/_fp16_optimizer.py b/colossalai/legacy/amp/naive_amp/_fp16_optimizer.py similarity index 97% rename from colossalai/amp/naive_amp/_fp16_optimizer.py rename to colossalai/legacy/amp/naive_amp/_fp16_optimizer.py index e4699f92b..273347759 100644 --- a/colossalai/amp/naive_amp/_fp16_optimizer.py +++ b/colossalai/legacy/amp/naive_amp/_fp16_optimizer.py @@ -6,14 +6,15 @@ import torch.distributed as dist from torch.distributed import ProcessGroup from torch.optim import Optimizer -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.amp.naive_amp.grad_scaler import BaseGradScaler from colossalai.kernel.op_builder import FusedOptimBuilder +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes from colossalai.logging import get_dist_logger -from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier +from colossalai.utils import multi_tensor_applier from ._utils import has_inf_or_nan, zero_gard_by_list -from .grad_scaler import BaseGradScaler try: from colossalai._C import fused_optim diff --git a/colossalai/amp/naive_amp/_utils.py b/colossalai/legacy/amp/naive_amp/_utils.py similarity index 100% rename from colossalai/amp/naive_amp/_utils.py rename to colossalai/legacy/amp/naive_amp/_utils.py diff --git a/colossalai/amp/naive_amp/naive_amp.py b/colossalai/legacy/amp/naive_amp/naive_amp.py similarity index 94% rename from colossalai/amp/naive_amp/naive_amp.py rename to colossalai/legacy/amp/naive_amp/naive_amp.py index 6a39d518d..1fab3e5a0 100644 --- a/colossalai/amp/naive_amp/naive_amp.py +++ b/colossalai/legacy/amp/naive_amp/naive_amp.py @@ -11,14 +11,14 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed import ReduceOp from torch.optim import Optimizer -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.interface import OptimizerWrapper +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from ._fp16_optimizer import FP16Optimizer -class NaiveAMPOptimizer(ColossalaiOptimizer): +class NaiveAMPOptimizer(OptimizerWrapper): """A wrapper class for optimizer to cast all parameters to fp16 Args: @@ -57,7 +57,7 @@ 
class NaiveAMPModel(nn.Module): Args: model (torch.nn.Module): torch.nn.Module to be wrapped. output_to_fp32 (bool, optional): Whether cast output of this module into fp32. (Default: True) - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this module. (Default: ``ParallelMode.DATA``) sync_buffer (bool, optional): whether to synchronize buffer. (Default: True) diff --git a/colossalai/amp/torch_amp/__init__.py b/colossalai/legacy/amp/torch_amp/__init__.py similarity index 100% rename from colossalai/amp/torch_amp/__init__.py rename to colossalai/legacy/amp/torch_amp/__init__.py diff --git a/colossalai/amp/torch_amp/_grad_scaler.py b/colossalai/legacy/amp/torch_amp/_grad_scaler.py similarity index 99% rename from colossalai/amp/torch_amp/_grad_scaler.py rename to colossalai/legacy/amp/torch_amp/_grad_scaler.py index ed4b8e484..543dac6ab 100644 --- a/colossalai/amp/torch_amp/_grad_scaler.py +++ b/colossalai/legacy/amp/torch_amp/_grad_scaler.py @@ -13,8 +13,8 @@ import torch.distributed as dist from packaging import version from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc class _MultiDeviceReplicator(object): diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/legacy/amp/torch_amp/torch_amp.py similarity index 95% rename from colossalai/amp/torch_amp/torch_amp.py rename to colossalai/legacy/amp/torch_amp/torch_amp.py index 65718d77c..c45a5956a 100644 --- a/colossalai/amp/torch_amp/torch_amp.py +++ b/colossalai/legacy/amp/torch_amp/torch_amp.py @@ -7,13 +7,13 @@ from torch import Tensor from torch.nn.modules.loss import _Loss from torch.optim import Optimizer -from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.utils import clip_grad_norm_fp32 +from colossalai.interface import OptimizerWrapper +from colossalai.legacy.utils import clip_grad_norm_fp32 from ._grad_scaler import GradScaler -class TorchAMPOptimizer(ColossalaiOptimizer): +class TorchAMPOptimizer(OptimizerWrapper): """A wrapper class which integrate Pytorch AMP with an optimizer Args: diff --git a/colossalai/legacy/communication/collective.py b/colossalai/legacy/communication/collective.py index 64fb5b8b5..747118822 100644 --- a/colossalai/legacy/communication/collective.py +++ b/colossalai/legacy/communication/collective.py @@ -6,8 +6,8 @@ import torch.distributed as dist from torch import Tensor from torch.distributed import ReduceOp -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc _all_gather_func = dist._all_gather_base \ if "all_gather_into_tensor" not in dir(dist) else dist.all_gather_into_tensor @@ -26,7 +26,7 @@ def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: Args: tensor (:class:`torch.Tensor`): Tensor to be gathered. dim (int): The dimension concatenating in. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication. 
async_op (bool, optional): Whether operations are asynchronous. Returns: @@ -65,7 +65,7 @@ def reduce_scatter(tensor: Tensor, Args: tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered. dim (int): The dimension concatenating in. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication. op (torch.distributed.ReduceOp, optional): The type of reduce operation, should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR]. More details about ReduceOp please refer to @@ -105,7 +105,7 @@ def all_reduce(tensor: Tensor, Args: tensor (:class:`torch.Tensor`): Tensor to be all-reduced. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication. op (torch.distributed.ReduceOp, optional): The type of reduce operation, should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR]. More details about ReduceOp please refer to @@ -141,7 +141,7 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b Args: tensor (:class:`torch.Tensor`): Tensor to be broadcast. src (int): Source rank. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication. async_op (bool, optional): Whether operations are asynchronous. Returns: @@ -173,7 +173,7 @@ def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = Args: tensor (:class:`torch.Tensor`): Tensor to be reduced. dst (int): Destination rank. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication. async_op (bool, optional): Whether operations are asynchronous. 
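The collective helpers above keep their signatures; only the `ParallelMode` they reference now comes from `colossalai.legacy.context`. A small usage sketch, assuming the legacy global context has already been initialized so that `ParallelMode.DATA` has a process group:

```python
# Sketch of the legacy collective helpers; assumes the legacy global context has been
# initialized (e.g. via colossalai.legacy.launch) so ParallelMode.DATA has a group.
import torch
from colossalai.legacy.communication.collective import all_gather, all_reduce
from colossalai.legacy.context import ParallelMode

x = torch.ones(4, 4, device='cuda')
gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.DATA)   # concat along dim 0
reduced = all_reduce(x, parallel_mode=ParallelMode.DATA)           # sum across the group
```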
Returns: diff --git a/colossalai/legacy/communication/p2p.py b/colossalai/legacy/communication/p2p.py index d28d14016..e3f9108ab 100644 --- a/colossalai/legacy/communication/p2p.py +++ b/colossalai/legacy/communication/p2p.py @@ -8,8 +8,8 @@ from typing import List, Tuple, Union import torch import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.utils import get_current_device from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks diff --git a/colossalai/legacy/communication/p2p_v2.py b/colossalai/legacy/communication/p2p_v2.py index 090311cb3..66af21495 100644 --- a/colossalai/legacy/communication/p2p_v2.py +++ b/colossalai/legacy/communication/p2p_v2.py @@ -10,8 +10,8 @@ import torch.distributed as dist from torch.distributed import ProcessGroupNCCL from torch.distributed import distributed_c10d as c10d -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc TensorShape = Union[torch.Size, List[int], Tuple[int]] _pg_manager = {} diff --git a/colossalai/legacy/communication/ring.py b/colossalai/legacy/communication/ring.py index aece7574b..e80192fb5 100644 --- a/colossalai/legacy/communication/ring.py +++ b/colossalai/legacy/communication/ring.py @@ -3,8 +3,8 @@ import torch -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.utils import get_current_device, synchronize diff --git a/colossalai/legacy/communication/utils.py b/colossalai/legacy/communication/utils.py index 1516df356..7e3dcf1e9 100644 --- a/colossalai/legacy/communication/utils.py +++ b/colossalai/legacy/communication/utils.py @@ -3,8 +3,8 @@ from typing import List, Tuple, Union import torch import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.utils import get_current_device TensorShape = Union[torch.Size, List[int], Tuple[int]] diff --git a/colossalai/constants.py b/colossalai/legacy/constants.py similarity index 100% rename from colossalai/constants.py rename to colossalai/legacy/constants.py diff --git a/colossalai/legacy/context/__init__.py b/colossalai/legacy/context/__init__.py new file mode 100644 index 000000000..7027945ea --- /dev/null +++ b/colossalai/legacy/context/__init__.py @@ -0,0 +1,4 @@ +from .parallel_context import ParallelContext +from .parallel_mode import ParallelMode +from .process_group_initializer import * +from .random import * diff --git a/colossalai/context/parallel_context.py b/colossalai/legacy/context/parallel_context.py similarity index 88% rename from colossalai/context/parallel_context.py rename to colossalai/legacy/context/parallel_context.py index 7186f052e..8fdc3d6fe 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/legacy/context/parallel_context.py @@ -11,10 +11,10 @@ import numpy as np import torch import torch.distributed as 
dist -from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING from colossalai.context.config import Config from colossalai.context.singleton_meta import SingletonMeta -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.constants import ALLOWED_MODES, INITIALIZER_MAPPING +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from colossalai.logging import get_dist_logger @@ -110,12 +110,12 @@ class ParallelContext(metaclass=SingletonMeta): """Adds the global rank of the current device for `parallel_mode` to the context. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode for the rank. rank (int): The rank to be added Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. """ self._check_parallel_mode(parallel_mode) self._global_ranks[parallel_mode] = rank @@ -124,11 +124,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the local rank of the current device. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: int: The local rank of the current device for `parallel_mode`. @@ -140,12 +140,12 @@ class ParallelContext(metaclass=SingletonMeta): """Adds the local rank of the current device for `parallel_mode` to the context. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode for the rank. rank (int): The rank to be added. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. """ self._check_parallel_mode(parallel_mode) self._local_ranks[parallel_mode] = rank @@ -154,11 +154,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the global rank of the next device. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: int: The global rank of the next device for `parallel_mode`. @@ -176,11 +176,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the global rank of the previous device. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: int: The global rank of the previous device for `parallel_mode`. 
@@ -199,11 +199,11 @@ class ParallelContext(metaclass=SingletonMeta): among its group for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: bool: a boolean value indicating whether the current device is the first one @@ -217,11 +217,11 @@ class ParallelContext(metaclass=SingletonMeta): among its group for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: bool: a boolean value indicating whether the current device is the first one @@ -248,11 +248,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the world size for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: int: The world size for `parallel_mode`. @@ -264,12 +264,12 @@ class ParallelContext(metaclass=SingletonMeta): """Adds world size for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode corresponding to the process group world_size (int): The world size to be added Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. """ self._check_parallel_mode(parallel_mode) self._world_sizes[parallel_mode] = world_size @@ -278,11 +278,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the group of the current device for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`. @@ -294,12 +294,12 @@ class ParallelContext(metaclass=SingletonMeta): """Adds the group of the current device for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. group (torch.distributed.ProcessGroup): The group to be added Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. 
""" self._check_parallel_mode(parallel_mode) self._groups[parallel_mode] = group @@ -308,9 +308,9 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the Gloo group of the current device for `parallel_mode`. :param parallel_mode: The chosen parallel mode - :type parallel_mode: :class:`colossalai.context.ParallelMode` + :type parallel_mode: :class:`colossalai.legacy.context.ParallelMode` :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode` + of :class:`colossalai.legacy.context.ParallelMode` :return: The group of the current device for `parallel_mode` :rtype: torch.distributed.ProcessGroup """ @@ -321,11 +321,11 @@ class ParallelContext(metaclass=SingletonMeta): """Adds the Gloo group of the current device for `parallel_mode`. :param parallel_mode: The chosen parallel mode - :type parallel_mode: :class:`colossalai.context.ParallelMode` + :type parallel_mode: :class:`colossalai.legacy.context.ParallelMode` :param group: The group to be added :type group: torch.distributed.ProcessGroup :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode` + of :class:`colossalai.legacy.context.ParallelMode` """ self._check_parallel_mode(parallel_mode) self._cpu_groups[parallel_mode] = group @@ -334,11 +334,11 @@ class ParallelContext(metaclass=SingletonMeta): """Returns the rank of the current device for `parallel_mode` in the group. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. Returns: int: The rank of the current device for `parallel_mode` in the group. @@ -350,12 +350,12 @@ class ParallelContext(metaclass=SingletonMeta): """Adds the ranks of the current device for `parallel_mode` in the group. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. ranks (list): List of ranks to be added Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance - of :class:`colossalai.context.ParallelMode`. + of :class:`colossalai.legacy.context.ParallelMode`. """ self._check_parallel_mode(parallel_mode) self._ranks_in_group[parallel_mode] = ranks @@ -489,7 +489,7 @@ class ParallelContext(metaclass=SingletonMeta): in the current system. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Returns: bool: a boolean value indicating whether `parallel_mode` is initialized in the current system. 
diff --git a/colossalai/context/parallel_mode.py b/colossalai/legacy/context/parallel_mode.py similarity index 100% rename from colossalai/context/parallel_mode.py rename to colossalai/legacy/context/parallel_mode.py diff --git a/colossalai/context/process_group_initializer/__init__.py b/colossalai/legacy/context/process_group_initializer/__init__.py similarity index 100% rename from colossalai/context/process_group_initializer/__init__.py rename to colossalai/legacy/context/process_group_initializer/__init__.py index d3937a947..48d52d7b9 100644 --- a/colossalai/context/process_group_initializer/__init__.py +++ b/colossalai/legacy/context/process_group_initializer/__init__.py @@ -3,10 +3,10 @@ from .initializer_2d import Initializer_2D from .initializer_2p5d import Initializer_2p5D from .initializer_3d import Initializer_3D from .initializer_data import Initializer_Data +from .initializer_model import Initializer_Model from .initializer_pipeline import Initializer_Pipeline from .initializer_sequence import Initializer_Sequence from .initializer_tensor import Initializer_Tensor -from .initializer_model import Initializer_Model from .process_group_initializer import ProcessGroupInitializer __all__ = [ diff --git a/colossalai/context/process_group_initializer/initializer_1d.py b/colossalai/legacy/context/process_group_initializer/initializer_1d.py similarity index 96% rename from colossalai/context/process_group_initializer/initializer_1d.py rename to colossalai/legacy/context/process_group_initializer/initializer_1d.py index ba601d0bf..d853c6f06 100644 --- a/colossalai/context/process_group_initializer/initializer_1d.py +++ b/colossalai/legacy/context/process_group_initializer/initializer_1d.py @@ -3,7 +3,7 @@ import torch.distributed as dist -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode diff --git a/colossalai/context/process_group_initializer/initializer_2d.py b/colossalai/legacy/context/process_group_initializer/initializer_2d.py similarity index 98% rename from colossalai/context/process_group_initializer/initializer_2d.py rename to colossalai/legacy/context/process_group_initializer/initializer_2d.py index 999cd5f0c..39f6a4689 100644 --- a/colossalai/context/process_group_initializer/initializer_2d.py +++ b/colossalai/legacy/context/process_group_initializer/initializer_2d.py @@ -2,7 +2,7 @@ import math import torch.distributed as dist -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode diff --git a/colossalai/context/process_group_initializer/initializer_2p5d.py b/colossalai/legacy/context/process_group_initializer/initializer_2p5d.py similarity index 99% rename from colossalai/context/process_group_initializer/initializer_2p5d.py rename to colossalai/legacy/context/process_group_initializer/initializer_2p5d.py index b92ae2eec..bb7a35095 100644 --- a/colossalai/context/process_group_initializer/initializer_2p5d.py +++ b/colossalai/legacy/context/process_group_initializer/initializer_2p5d.py @@ -6,7 +6,7 @@ import math import torch.distributed as dist from colossalai.context import Config -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables 
import tensor_parallel_env as env from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode diff --git a/colossalai/context/process_group_initializer/initializer_3d.py b/colossalai/legacy/context/process_group_initializer/initializer_3d.py similarity index 99% rename from colossalai/context/process_group_initializer/initializer_3d.py rename to colossalai/legacy/context/process_group_initializer/initializer_3d.py index 6bca05ad7..3dfbf5223 100644 --- a/colossalai/context/process_group_initializer/initializer_3d.py +++ b/colossalai/legacy/context/process_group_initializer/initializer_3d.py @@ -5,7 +5,7 @@ import math import torch.distributed as dist -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import DIST_GROUP_INITIALIZER from ..parallel_mode import ParallelMode diff --git a/colossalai/context/process_group_initializer/initializer_data.py b/colossalai/legacy/context/process_group_initializer/initializer_data.py similarity index 100% rename from colossalai/context/process_group_initializer/initializer_data.py rename to colossalai/legacy/context/process_group_initializer/initializer_data.py diff --git a/colossalai/context/process_group_initializer/initializer_model.py b/colossalai/legacy/context/process_group_initializer/initializer_model.py similarity index 100% rename from colossalai/context/process_group_initializer/initializer_model.py rename to colossalai/legacy/context/process_group_initializer/initializer_model.py diff --git a/colossalai/context/process_group_initializer/initializer_pipeline.py b/colossalai/legacy/context/process_group_initializer/initializer_pipeline.py similarity index 100% rename from colossalai/context/process_group_initializer/initializer_pipeline.py rename to colossalai/legacy/context/process_group_initializer/initializer_pipeline.py diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/legacy/context/process_group_initializer/initializer_sequence.py similarity index 100% rename from colossalai/context/process_group_initializer/initializer_sequence.py rename to colossalai/legacy/context/process_group_initializer/initializer_sequence.py diff --git a/colossalai/context/process_group_initializer/initializer_tensor.py b/colossalai/legacy/context/process_group_initializer/initializer_tensor.py similarity index 100% rename from colossalai/context/process_group_initializer/initializer_tensor.py rename to colossalai/legacy/context/process_group_initializer/initializer_tensor.py diff --git a/colossalai/context/process_group_initializer/process_group_initializer.py b/colossalai/legacy/context/process_group_initializer/process_group_initializer.py similarity index 100% rename from colossalai/context/process_group_initializer/process_group_initializer.py rename to colossalai/legacy/context/process_group_initializer/process_group_initializer.py diff --git a/colossalai/context/random/__init__.py b/colossalai/legacy/context/random/__init__.py similarity index 100% rename from colossalai/context/random/__init__.py rename to colossalai/legacy/context/random/__init__.py diff --git a/colossalai/context/random/_helper.py b/colossalai/legacy/context/random/_helper.py similarity index 90% rename from colossalai/context/random/_helper.py rename to colossalai/legacy/context/random/_helper.py index 973c4d9fa..4b5d5ef2f 100644 --- a/colossalai/context/random/_helper.py +++ 
b/colossalai/legacy/context/random/_helper.py @@ -7,8 +7,8 @@ from contextlib import contextmanager import torch.cuda from torch import Tensor -from .seed_manager import SeedManager from ..parallel_mode import ParallelMode +from .seed_manager import SeedManager _SEED_MANAGER = SeedManager() @@ -53,11 +53,11 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False): """Adds a seed to the seed manager for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. seed (int): The seed to be added Raises: AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of - :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added. + :class:`colossalai.legacy.context.ParallelMode` or the seed for `parallel_mode` has been added. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -70,7 +70,7 @@ def set_mode(parallel_mode: ParallelMode): """Sets the current mode of the seed manager. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -83,7 +83,7 @@ def set_seed_states(parallel_mode: ParallelMode, state: Tensor): """Sets the state of the seed manager for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. state (:class:`torch.Tensor`): the state to be set. Raises: @@ -161,7 +161,7 @@ def with_seed(func, parallel_mode: ParallelMode): def moe_set_seed(seed): if torch.cuda.is_available(): - from colossalai.core import global_context as gpc + from colossalai.legacy.core import global_context as gpc global_rank = gpc.get_global_rank() diff_seed = seed + global_rank add_seed(ParallelMode.TENSOR, diff_seed, True) diff --git a/colossalai/context/random/seed_manager.py b/colossalai/legacy/context/random/seed_manager.py similarity index 86% rename from colossalai/context/random/seed_manager.py rename to colossalai/legacy/context/random/seed_manager.py index 956f90012..b657ff7e1 100644 --- a/colossalai/context/random/seed_manager.py +++ b/colossalai/legacy/context/random/seed_manager.py @@ -4,7 +4,7 @@ import torch from torch import Tensor -from colossalai.context.parallel_mode import ParallelMode +from colossalai.legacy.context.parallel_mode import ParallelMode class SeedManager: @@ -36,7 +36,7 @@ class SeedManager: """Sets the state of the seed manager for `parallel_mode`. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. state (:class:`torch.Tensor`): the state to be set. Raises: @@ -49,7 +49,7 @@ class SeedManager: """Sets the current mode of the seed manager. Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. """ if self.current_mode: # save the current state for current mode @@ -63,12 +63,12 @@ class SeedManager: """Adds a seed to the seed manager for `parallel_mode`. 
Args: - parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. seed (int): The seed to be added. overwrite (bool, optional): Whether allows to overwrite the seed that has been set already Raises: - AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode` + AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.legacy.context.ParallelMode` or the seed for `parallel_mode` has been added. """ assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided' diff --git a/colossalai/legacy/core.py b/colossalai/legacy/core.py new file mode 100644 index 000000000..0aaf1ee47 --- /dev/null +++ b/colossalai/legacy/core.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from colossalai.legacy.context.parallel_context import global_context + +__all__ = ['global_context'] diff --git a/colossalai/legacy/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py index 9af4469f4..930caf20c 100644 --- a/colossalai/legacy/engine/_base_engine.py +++ b/colossalai/legacy/engine/_base_engine.py @@ -8,6 +8,7 @@ from torch import Tensor from torch.nn import Module from torch.nn.modules.loss import _Loss +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine.gradient_handler import BaseGradientHandler from colossalai.legacy.engine.schedule import ( BaseSchedule, @@ -15,9 +16,8 @@ from colossalai.legacy.engine.schedule import ( NonPipelineSchedule, PipelineSchedule, ) +from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively class Engine: @@ -27,7 +27,7 @@ class Engine: Args: model (``torch.nn.Module``): The neural network model. - optimizer (``colossalai.nn.optimizer.ColossalaiOptimizer``): Optimizer for updating the parameters. + optimizer (``colossalai.interface.OptimizerWrapper``): Optimizer for updating the parameters. criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss. gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward. clip_grad_norm (float, optional): The norm of gradient clipping. 
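Besides the path moves, the engine hunks swap the deprecated `ColossalaiOptimizer` for `colossalai.interface.OptimizerWrapper`, whose gradient clipping no longer needs the model handle: `clip_grad_by_norm(max_norm)` replaces `clip_grad_norm(model, max_norm)`, as the next hunk shows. A minimal sketch of the wrapper used in isolation, outside of `Engine`; the model and hyper-parameters are placeholders, and `backward`/`zero_grad` are assumed to be the wrapper's usual pass-throughs.

import torch
import torch.nn as nn

from colossalai.interface import OptimizerWrapper

model = nn.Linear(16, 4)    # placeholder model
optimizer = OptimizerWrapper(optim=torch.optim.SGD(model.parameters(), lr=0.1))

loss = model(torch.randn(8, 16)).sum()
optimizer.backward(loss)            # wrapper-mediated backward pass
optimizer.clip_grad_by_norm(1.0)    # replaces optimizer.clip_grad_norm(model, 1.0)
optimizer.step()
optimizer.zero_grad()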
@@ -61,7 +61,7 @@ class Engine: def __init__(self, model: Module, - optimizer: "ColossalaiOptimizer", + optimizer: "OptimizerWrapper", criterion: Optional[_Loss] = None, gradient_handlers: Optional[List[BaseGradientHandler]] = None, clip_grad_norm: float = 0.0, @@ -157,7 +157,7 @@ class Engine: """Execute parameter update """ self._all_reduce_gradients() - self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) + self.optimizer.clip_grad_by_norm(self._clip_grad_norm) return self.optimizer.step() def backward(self, loss: Tensor): diff --git a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py index c466f7e2d..c2270dc53 100644 --- a/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py +++ b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py @@ -10,12 +10,12 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader +from colossalai.interface import OptimizerWrapper from colossalai.legacy.engine import BaseGradientHandler -from colossalai.nn.optimizer import ColossalaiOptimizer from colossalai.utils import conditional_context -class GradAccumOptimizer(ColossalaiOptimizer): +class GradAccumOptimizer(OptimizerWrapper): """A wrapper for the optimizer to enable gradient accumulation by skipping the steps before accumulation size is reached. @@ -74,7 +74,7 @@ class GradAccumOptimizer(ColossalaiOptimizer): if self.accumulate_step < self.accumulate_size: pass else: - self.optim.clip_grad_norm(model, max_norm) + self.optim.clip_grad_by_norm(max_norm) def backward(self, loss: Tensor) -> None: """Execute backward pass. diff --git a/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py index c5da2e55a..c692ee903 100644 --- a/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py @@ -1,5 +1,5 @@ -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py index 395d83da0..e7a6df2d8 100644 --- a/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py @@ -1,6 +1,6 @@ from colossalai.context.moe_context import MOE_CONTEXT -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import GRADIENT_HANDLER from colossalai.utils.moe import get_moe_epsize_param_dict diff --git a/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py index 7d4d9d73a..3eae7d58a 100644 --- a/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py +++ 
b/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py @@ -6,7 +6,7 @@ import torch import torch.distributed as dist from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py index 41098ab39..38b7f5993 100644 --- a/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py +++ b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py @@ -1,5 +1,5 @@ -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import GRADIENT_HANDLER from ._base_gradient_handler import BaseGradientHandler diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py index 4571fd679..37eed82f8 100644 --- a/colossalai/legacy/engine/schedule/_pipeline_schedule.py +++ b/colossalai/legacy/engine/schedule/_pipeline_schedule.py @@ -7,11 +7,11 @@ from typing import Callable, List, Tuple, Union import torch.cuda import colossalai.legacy.communication as comm -from colossalai.amp.naive_amp import NaiveAMPModel -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.amp.naive_amp import NaiveAMPModel +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import switch_virtual_pipeline_parallel_rank from colossalai.logging import get_dist_logger -from colossalai.utils import switch_virtual_pipeline_parallel_rank from colossalai.utils.cuda import get_current_device from ._base_schedule import BaseSchedule @@ -157,7 +157,7 @@ class PipelineSchedule(BaseSchedule): return self._move_to_device(micro_batch_data) def pre_processing(self, engine): - from colossalai.zero.legacy import ShardedModelV2 + from colossalai.legacy.zero import ShardedModelV2 # TODO: remove this after testing new zero with pipeline parallelism model = engine.model diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py index 385c61537..bf8b599a8 100644 --- a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py +++ b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py @@ -6,8 +6,8 @@ from typing import Iterable, Tuple import torch.cuda import colossalai.legacy.communication.p2p_v2 as comm -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.engine import Engine from colossalai.utils.cuda import get_current_device diff --git a/colossalai/global_variables.py b/colossalai/legacy/global_variables.py similarity index 100% rename from colossalai/global_variables.py rename to colossalai/legacy/global_variables.py diff --git 
a/colossalai/legacy/initialize.py b/colossalai/legacy/initialize.py new file mode 100644 index 000000000..2c253adba --- /dev/null +++ b/colossalai/legacy/initialize.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import argparse +import os +import pprint +from pathlib import Path +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn.modules.loss import _Loss +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.optimizer import Optimizer +from torch.utils.data import DataLoader + +from colossalai.context import Config, ConfigException +from colossalai.context.moe_context import MOE_CONTEXT +from colossalai.interface import OptimizerWrapper +from colossalai.legacy.amp import AMP_TYPE, convert_to_amp +from colossalai.legacy.amp.naive_amp import NaiveAMPModel +from colossalai.legacy.builder.builder import build_gradient_handler +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.engine import Engine +from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient +from colossalai.legacy.engine.schedule import ( + InterleavedPipelineSchedule, + NonPipelineSchedule, + PipelineSchedule, + get_tensor_shape, +) +from colossalai.legacy.utils import is_using_ddp, is_using_pp, is_using_sequence, sync_model_param +from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2 +from colossalai.legacy.zero.gemini.ophooks import BaseOpHook +from colossalai.logging import get_dist_logger +from colossalai.utils import get_current_device +from colossalai.utils.moe import sync_moe_model_param + + +def get_default_parser(): + """Reads user command line and uses an argument parser to parse the input arguments. + Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. + + Returns: + Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--config', type=str, help='path to the config file') + parser.add_argument('--host', type=str, help='the master address for distributed training') + parser.add_argument('--port', type=int, help='the master port for distributed training') + parser.add_argument('--world_size', type=int, help='world size for distributed training') + parser.add_argument('--rank', type=int, help='rank for the default process group') + parser.add_argument('--local_rank', type=int, help='local rank on the node') + parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication') + return parser + + +def launch(config: Union[str, Path, Config, Dict], + rank: int, + world_size: int, + host: str, + port: int, + backend: str = 'nccl', + local_rank: int = None, + seed: int = 1024, + verbose: bool = True): + """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input + arguments are not given. Then initialize and set distributed environment by calling global_context's functions. 
+ + Args: + config (Union[str, dict, Config]): Config file or config file path are both acceptable + rank (int): Rank for the default process group + world_size (int): World size of the default process group + host (str): The master address for distributed training + port (str): The master port for distributed training + backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` + local_rank (int, optional): + Rank for the process on the node and is used to set the default CUDA device, + defaults to None. If local_rank = None, the default device ordinal will be calculated automatically. + seed (int, optional): Specified random seed for every process. Defaults to 1024. + verbose (bool, optional): Whether to print logs. Defaults to True. + + Raises: + Exception: Raise exception when config type is wrong + """ + gpc.verbose = verbose + + # set config + assert isinstance(config, (Config, str, Path, dict)), \ + f'expected argument config to be Config, str or Path, but got {type(config)}' + if not isinstance(config, Config) and isinstance(config, dict): + config = Config(config) + if isinstance(config, (str, Path)): + config = Config.from_file(config) + gpc.load_config(config) + + # init default process group + gpc.init_global_dist(rank, world_size, backend, host, port) + + # init process groups for different parallel modes from config + gpc.init_parallel_groups() + + # set cuda device + if torch.cuda.is_available(): + # if local rank is not given, calculate automatically + gpc.set_device(local_rank) + + # set the number of processes running on the same node + gpc.detect_num_processes_on_current_node() + + gpc.set_seed(seed) + + if verbose: + logger = get_dist_logger() + logger.info( + f'Distributed environment is initialized, ' + f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, ' + f'tensor parallel size: {gpc.tensor_parallel_size}', + ranks=[0]) + + +def launch_from_slurm(config: Union[str, Path, Config, Dict], + host: str, + port: int, + backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables + set by SLURM + + Args: + config (Union[str, dict, Config]): Config file or config file path are both acceptable + host (str): The master address for distributed training + port (str): The master port for distributed training + backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` + seed (int, optional): Specified random seed for every process. Defaults to 1024. + verbose (bool, optional): Whether to print logs. Defaults to True. 
+ """ + try: + rank = int(os.environ['SLURM_PROCID']) + world_size = int(os.environ['SLURM_NPROCS']) + except KeyError as e: + raise RuntimeError( + f"Could not find {e} in the SLURM environment, visit https://www.colossalai.org/ for more information on launching with SLURM" + ) + + launch(config=config, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def launch_from_openmpi(config: Union[str, Path, Config, Dict], + host: str, + port: int, + backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables + set by OpenMPI + + Args: + config (Union[str, dict, Config]): Config file or config file path are both acceptable + host (str): The master address for distributed training + port (str): The master port for distributed training + backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` + seed (int, optional): Specified random seed for every process. Defaults to 1024. + verbose (bool, optional): Whether to print logs. Defaults to True. + """ + try: + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + except KeyError as e: + raise RuntimeError( + f"Could not find {e} in the OpenMPI environment, visit https://www.colossalai.org/ for more information on launching with OpenMPI" + ) + + launch(config=config, + local_rank=local_rank, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def launch_from_torch(config: Union[str, Path, Config, Dict], + backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size + from the environment variables set by PyTorch + + Args: + config (Union[str, dict, Config]): Config file or config file path are both acceptable + backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` + seed (int, optional): Specified random seed for every process. Defaults to 1024. + verbose (bool, optional): Whether to print logs. Defaults to True. + """ + try: + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ['WORLD_SIZE']) + host = os.environ['MASTER_ADDR'] + port = int(os.environ['MASTER_PORT']) + except KeyError as e: + raise RuntimeError( + f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" + ) + + launch(config=config, + local_rank=local_rank, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def initialize(model: nn.Module, + optimizer: Optimizer, + criterion: Optional[_Loss] = None, + train_dataloader: Optional[Iterable] = None, + test_dataloader: Optional[Iterable] = None, + lr_scheduler: Optional[_LRScheduler] = None, + ophooks: Optional[List[BaseOpHook]] = None, + verbose: bool = True) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]: + """Core function to wrap the essential training components with our functionality based on the config which is + loaded into gpc.config. + + Args: + model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model. 
+ optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`): + Your optimizer instance. + criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance. + train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training. + test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing. + lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional. + verbose (bool, optional): Whether to print logs. + + Returns: + Tuple (engine, train_dataloader, test_dataloader, lr_scheduler): + A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)`` + where only ``engine`` could not be None. + """ + # get logger + logger = get_dist_logger() + gpc.verbose = verbose + + # get config from gpc + config = gpc.config + + # print config + if verbose: + logger.info( + f"\n========== Your Config ========\n" + f"{pprint.pformat(gpc.config)}\n" + f"================================\n", + ranks=[0]) + + # cudnn + cudnn_benchmark = config.get('cudnn_benchmark', False) + cudnn_deterministic = config.get('cudnn_deterministic', False) + torch.backends.cudnn.benchmark = cudnn_benchmark + torch.backends.cudnn.deterministic = cudnn_deterministic + if verbose: + logger.info(f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0]) + + # zero + use_zero = hasattr(gpc.config, 'zero') + if use_zero: + zero_cfg = gpc.config.get('zero', None) + if zero_cfg is not None: + cfg_ = zero_cfg.copy() + else: + cfg_ = {} + optimizer_config = zero_cfg.get('optimizer_config', None) + model_config = zero_cfg.get('model_config', None) + model, optimizer = convert_to_zero_v2(model, + optimizer, + model_config=model_config, + optimizer_config=optimizer_config) + + logger.info("Initializing ZeRO model and optimizer finished!", ranks=[0]) + else: + if isinstance(model, nn.Module): + # first sync model across dp ranks + model.to(get_current_device()) + elif isinstance(model, Callable): + model = model().to(get_current_device()) + + # optimizer maybe a optimizer_cls + if isinstance(optimizer, Callable): + optimizer = optimizer(model.parameters()) + logger.warning("Initializing an non ZeRO model with optimizer class") + + if not use_zero: + if is_using_sequence(): + sync_model_param(model, ParallelMode.SEQUENCE_DP) + elif MOE_CONTEXT.is_initialized: + sync_moe_model_param(model) + elif is_using_ddp(): + sync_model_param(model, ParallelMode.DATA) + else: + logger.warning( + "The parameters of models is not automatically synchronized.\n" + "Please make sure that all parameters are the same in data parallel group.", + ranks=[0]) + + # check amp and zero + fp16_cfg = gpc.config.get('fp16', None) + + if fp16_cfg is not None and fp16_cfg.mode is not None and use_zero: + raise ConfigException( + "It is not allowed to set fp16 and zero configuration in your config file at the same time") + + # clip grad norm + clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0) + + # initialize amp + amp_mode = None + if fp16_cfg is not None and fp16_cfg.mode is not None: + cfg_ = fp16_cfg.copy() + amp_mode = cfg_.pop('mode') + if is_using_pp(): + assert amp_mode == AMP_TYPE.NAIVE, 'Pipeline only support NaiveAMP currently' + if amp_mode == AMP_TYPE.NAIVE: + cfg_['clip_grad_norm'] = clip_grad_norm + model, optimizer, criterion = convert_to_amp(model=model, + optimizer=optimizer, + criterion=criterion, + mode=amp_mode, + amp_config=cfg_) + + # get torch ddp config + 
torch_ddp_cfg = gpc.config.get('torch_ddp', dict()) + + # gradient handler + gradient_handler_cfg = gpc.config.get('gradient_handler', None) + if gradient_handler_cfg is None: + # if gradient handler is not specified in the configuration file, + # check in the following order + # 1. if optimizer is ZERO, then use zero grad handler + # 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp + # 3. if using pipeline and dp size larger than 1, use data parallel grad handler + if isinstance(optimizer, ShardedOptimizerV2): + gradient_handler_cfg = [dict(type='ZeROGradientHandler')] + if verbose: + logger.info( + "Training with zero is detected, ZeROGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) + elif is_using_ddp() and MOE_CONTEXT.is_initialized: + gradient_handler_cfg = [dict(type='MoeGradientHandler')] + if verbose: + logger.info( + "Data parallel training is detected with moe parallel, MoeGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) + elif is_using_sequence(): + model = DDP(model, + process_group=gpc.get_group(ParallelMode.SEQUENCE_DP), + device_ids=[torch.cuda.current_device()], + **torch_ddp_cfg) + if verbose: + logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism', + ranks=[0]) + elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE: + model = DDP(model, + process_group=gpc.get_group(ParallelMode.DATA), + device_ids=[torch.cuda.current_device()], + **torch_ddp_cfg) + if verbose: + logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Data Parallelism', ranks=[0]) + elif is_using_ddp(): + gradient_handler_cfg = [dict(type='DataParallelGradientHandler')] + if verbose: + logger.info( + "Data parallel training is detected when using pipeline parallel, " + "DataParallelGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) + # add pipeline parallel gradient handler, if pipeline shared module is detected + for param in model.parameters(): + if getattr(param, 'pipeline_shared_module_pg', None) is not None: + if gradient_handler_cfg is None: + gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')] + else: + gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler')) + if verbose: + logger.info( + "pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) + break + else: + if not isinstance(gradient_handler_cfg, list): + raise ConfigException( + f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}" + ) + + # turn off sync buffer for NaiveAMPModel if using torch DDP and NaiveAMPModel at the same time + # to avoid duplicated buffer synchronization + if isinstance(model, DDP) and isinstance(model.module, NaiveAMPModel): + model.module.sync_buffer = False + + # initialize schedule for engine + if is_using_pp(): + tensor_shape = get_tensor_shape() + use_interleaved = hasattr(gpc.config, 'model') and hasattr(gpc.config.model, 'num_chunks') + if gpc.is_initialized(ParallelMode.PARALLEL_1D): + scatter_gather = True + else: + scatter_gather = False + if use_interleaved: + if isinstance(model, nn.Sequential): + model = nn.ModuleList([model]) + schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, + gpc.config.model.num_chunks, + 
tensor_shape=tensor_shape, + scatter_gather_tensors=scatter_gather) + else: + schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES, + tensor_shape=tensor_shape, + scatter_gather_tensors=scatter_gather) + else: + schedule = NonPipelineSchedule() + + if gradient_handler_cfg is None: + gradient_handlers = None + if verbose and not isinstance(model, DDP): + logger.warning( + "No PyTorch DDP or gradient handler is set up, please make sure you do not need " + "to all-reduce the gradients after a training step.", + ranks=[0]) + else: + gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] + + # check if optimizer is OptimizerWrapper + if not isinstance(optimizer, (OptimizerWrapper, ShardedOptimizerV2)): + optimizer = OptimizerWrapper(optim=optimizer) + + # gradient accumulation + grad_accum_size = gpc.config.get('gradient_accumulation', None) + if grad_accum_size is not None: + optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + accumulate_size=grad_accum_size, + gradient_handlers=gradient_handlers, + lr_scheduler=lr_scheduler) + engine = Engine(model=model, + optimizer=optimizer, + criterion=criterion, + gradient_handlers=gradient_handlers, + clip_grad_norm=clip_grad_norm, + ophook_list=ophooks, + schedule=schedule) + + return engine, train_dataloader, test_dataloader, lr_scheduler diff --git a/colossalai/legacy/nn/__init__.py b/colossalai/legacy/nn/__init__.py index 500162901..d30ebf8d5 100644 --- a/colossalai/legacy/nn/__init__.py +++ b/colossalai/legacy/nn/__init__.py @@ -1,4 +1,3 @@ -from ._ops import * from .layer import * from .loss import * from .metric import * diff --git a/colossalai/legacy/nn/_ops/__init__.py b/colossalai/legacy/nn/_ops/__init__.py index 4991ad9a2..9a35d02ce 100644 --- a/colossalai/legacy/nn/_ops/__init__.py +++ b/colossalai/legacy/nn/_ops/__init__.py @@ -1,9 +1 @@ -from .addmm import colo_addmm -from .batch_norm import colo_batch_norm -from .element_wise import * -from .embedding import colo_embedding -from .embedding_bag import colo_embedding_bag -from .layernorm import colo_layernorm -from .linear import colo_linear -from .loss import colo_cross_entropy -from .view import colo_view +from ._utils import * diff --git a/colossalai/legacy/nn/_ops/_utils.py b/colossalai/legacy/nn/_ops/_utils.py index 131c21547..a4228fa21 100644 --- a/colossalai/legacy/nn/_ops/_utils.py +++ b/colossalai/legacy/nn/_ops/_utils.py @@ -3,9 +3,10 @@ from typing import List, Optional, Union import torch import torch.distributed as dist -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.nn.layer.utils import divide -from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup +from colossalai.legacy.tensor import ColoTensorSpec, ProcessGroup +from colossalai.tensor import ColoTensor GeneralTensor = Union[ColoTensor, torch.Tensor] Number = Union[int, float] diff --git a/colossalai/legacy/nn/_ops/addmm.py b/colossalai/legacy/nn/_ops/addmm.py deleted file mode 100644 index 660b48a71..000000000 --- a/colossalai/legacy/nn/_ops/addmm.py +++ /dev/null @@ -1,90 +0,0 @@ -import torch - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, Number, convert_to_colo_tensor, 
reduce_grad, reduce_input - - -def colo_addmm_1Drow(input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number, - alpha: Number) -> ColoTensor: - # mat1:S[1] x mat2:S[0] = Output:P - # beta * input + alpha * All-Reduce(Output) = res - - mat1 = mat1.redistribute(ShardSpec([-1], [mat2.get_tp_world_size()]), mat2.get_process_group()) - - # Output:P - partial_output = torch.mm(mat1, mat2) - # Reduce(Output) - output = reduce_input(partial_output, mat2.get_process_group()) - # input - assert not input_tensor.has_compute_spec(), 'Invalid input spec for 1Drow addmm op' - output = beta * input_tensor + alpha * output - output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(input_tensor.get_process_group())) - return output - - -def colo_addmm_1Dcol(input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number, - alpha: Number) -> ColoTensor: - # mat1:B x mat2:S[1] + input:S[1] = Output:S[1] - compute_spec = mat2.compute_spec - mat1 = mat1.redistribute(ReplicaSpec()) - mat1 = reduce_grad(mat1, mat1.get_process_group()) - - output_parallel = torch.addmm(input_tensor, mat1, mat2, beta=beta, alpha=alpha) - output_spec = ColoTensorSpec(input_tensor.get_process_group(), ShardSpec([-1], [mat2.get_tp_world_size()]), - ComputeSpec(ComputePattern.TP1D)) - output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec) - - if compute_spec.output_replicate: - return output.to_replicate() - else: - return output - - -def colo_addmm_1d(mode: str, input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number, - alpha: Number) -> ColoTensor: - assert mode in ('row', 'col') - funcs = {'row': colo_addmm_1Drow, 'col': colo_addmm_1Dcol} - return funcs[mode](input_tensor, mat1, mat2, beta, alpha) - - -@colo_op_impl(torch.addmm) -def colo_addmm(input_tensor: GeneralTensor, - mat1: ColoTensor, - mat2: ColoTensor, - beta: Number = 1, - alpha: Number = 1, - **kargs) -> ColoTensor: - """Handles ``__torch_function__`` dispatch for ``torch.nn.functional.linear``. - This method computes a linear. - """ - # At least one of the tensor should be ColoTensor - assert isinstance(mat2, ColoTensor) - input_tensor = convert_to_colo_tensor(input_tensor, mat2.get_process_group()) - mat1 = convert_to_colo_tensor(mat1, mat2.get_process_group()) - - # Add communication logic before and after linear call. 
- ret_tensor = None - if not mat2.has_compute_spec(): # No Model Parallel Applied - assert mat2.is_replicate(), 'Invalid mat2 spec for native addmm op' - assert input_tensor.is_replicate(), 'Invalid input spec for native addmm op' - ret_tensor = ColoTensor.from_torch_tensor(tensor=torch.addmm(input_tensor, - mat1, - mat2, - beta=beta, - alpha=alpha, - **kargs), - spec=ColoTensorSpec(mat2.get_process_group())) - elif mat2.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied - if mat2.is_shard_1drow() and input_tensor.is_replicate(): - mode = 'row' - elif mat2.is_shard_1dcol() and (input_tensor.is_shard_1dcol() or input_tensor.is_shard_1drow()): - mode = 'col' - else: - raise NotImplementedError - ret_tensor = colo_addmm_1d(mode, input_tensor, mat1, mat2, beta, alpha) - else: - raise NotImplementedError - - return ret_tensor diff --git a/colossalai/legacy/nn/_ops/batch_norm.py b/colossalai/legacy/nn/_ops/batch_norm.py deleted file mode 100644 index 54ecc88f4..000000000 --- a/colossalai/legacy/nn/_ops/batch_norm.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Optional - -import torch.nn.functional as F - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor - - -@colo_op_impl(F.batch_norm) -def colo_batch_norm( - input: GeneralTensor, - running_mean: Optional[GeneralTensor], - running_var: Optional[GeneralTensor], - weight: Optional[GeneralTensor] = None, - bias: Optional[GeneralTensor] = None, - training: bool = False, - momentum: float = 0.1, - eps: float = 1e-5, -): - assert isinstance(weight, ColoTensor) - running_mean = running_mean.detach() - running_var = running_var.detach() - - input = convert_to_colo_tensor(input, weight.get_process_group()) - bias = convert_to_colo_tensor(bias, weight.get_process_group()) - input = input.redistribute(ReplicaSpec()) - bias = bias.redistribute(ReplicaSpec()) - - output = F.batch_norm(input, running_mean, running_var, weight, bias, training, momentum, eps) - output = ColoTensor.from_torch_tensor(tensor=output, spec=ColoTensorSpec(pg=weight.get_process_group())) - return output diff --git a/colossalai/legacy/nn/_ops/element_wise.py b/colossalai/legacy/nn/_ops/element_wise.py deleted file mode 100644 index 2de51e24a..000000000 --- a/colossalai/legacy/nn/_ops/element_wise.py +++ /dev/null @@ -1,250 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import Tensor - -from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor - - -def register_elementwise_op(op): - - @colo_op_impl(op) - def elementwise_op(input_tensor: GeneralTensor, *args, **kwargs): - """ - Handles ``__torch_function__`` dispatch for the elementwise op such - as ``torch.nn.functional.gelu`` or ``torch.nn.functional.relu``. - This method computes on either a normal tensor or a sharded tensor. 
- """ - if 'inplace' in kwargs: - # TODO(jiaruifang) inplace will cause bugs - input_tensor = input_tensor.clone() - return op(input_tensor, *args, **kwargs) - else: - output = op(input_tensor, *args, **kwargs) - # return output - if isinstance(input_tensor, ColoTensor): - if isinstance(output, str): - return output - if not isinstance(output, torch.Tensor): - raise NotImplementedError - return ColoTensor.from_torch_tensor(output, - spec=ColoTensorSpec(input_tensor.get_process_group(), - dist_attr=input_tensor.dist_spec)) - - -# @colo_op_impl(torch.relu_) -# def elementwise_op(input_tensor): -# torch.relu_(input_tensor.data) -# return input_tensor - -# @colo_op_impl(Tensor.add_) -# def elementwise_op(input_tensor: ColoTensor, *args, **kwargs): -# input_tensor = input_tensor.data.add_(*args, **kwargs) -# return input_tensor - -# Tensor op -register_elementwise_op(Tensor.abs) -register_elementwise_op(Tensor.absolute) -register_elementwise_op(Tensor.acos) -register_elementwise_op(Tensor.arccos) -register_elementwise_op(Tensor.angle) -register_elementwise_op(Tensor.asin) -register_elementwise_op(Tensor.arcsin) -register_elementwise_op(Tensor.atan) -register_elementwise_op(Tensor.arctan) -register_elementwise_op(Tensor.all) -register_elementwise_op(Tensor.any) -register_elementwise_op(Tensor.bernoulli) -register_elementwise_op(Tensor.bfloat16) -register_elementwise_op(Tensor.bitwise_not) -register_elementwise_op(Tensor.bool) -register_elementwise_op(Tensor.byte) -register_elementwise_op(Tensor.ceil) -register_elementwise_op(Tensor.char) -register_elementwise_op(Tensor.clamp) -register_elementwise_op(Tensor.clamp_max) -register_elementwise_op(Tensor.clamp_min) -register_elementwise_op(Tensor.clip) -register_elementwise_op(Tensor.clone) -register_elementwise_op(Tensor.contiguous) -register_elementwise_op(Tensor.copysign) -register_elementwise_op(Tensor.cos) -register_elementwise_op(Tensor.cosh) -register_elementwise_op(Tensor.acosh) -register_elementwise_op(Tensor.arccosh) -register_elementwise_op(Tensor.cpu) -register_elementwise_op(Tensor.cuda) -register_elementwise_op(Tensor.deg2rad) -register_elementwise_op(Tensor.detach) -register_elementwise_op(Tensor.digamma) -register_elementwise_op(Tensor.double) -register_elementwise_op(Tensor.erf) -register_elementwise_op(Tensor.erfc) -register_elementwise_op(Tensor.erfinv) -register_elementwise_op(Tensor.exp) -register_elementwise_op(Tensor.expm1) -register_elementwise_op(Tensor.fix) -register_elementwise_op(Tensor.trunc) -register_elementwise_op(Tensor.float) -register_elementwise_op(Tensor.float_power) -register_elementwise_op(Tensor.floor) -register_elementwise_op(Tensor.frac) -register_elementwise_op(Tensor.half) -register_elementwise_op(Tensor.hardshrink) -register_elementwise_op(Tensor.heaviside) -register_elementwise_op(Tensor.i0) -register_elementwise_op(Tensor.int) -register_elementwise_op(Tensor.isfinite) -register_elementwise_op(Tensor.isinf) -register_elementwise_op(Tensor.isposinf) -register_elementwise_op(Tensor.isneginf) -register_elementwise_op(Tensor.isnan) -register_elementwise_op(Tensor.lgamma) -register_elementwise_op(Tensor.log) -register_elementwise_op(Tensor.log10) -register_elementwise_op(Tensor.log1p) -register_elementwise_op(Tensor.log2) -register_elementwise_op(Tensor.logical_not) -register_elementwise_op(Tensor.logit) -register_elementwise_op(Tensor.long) -register_elementwise_op(Tensor.nan_to_num) -register_elementwise_op(Tensor.neg) -register_elementwise_op(Tensor.negative) -register_elementwise_op(Tensor.positive) 
-register_elementwise_op(Tensor.pow) -register_elementwise_op(Tensor.rad2deg) -register_elementwise_op(Tensor.reciprocal) -register_elementwise_op(Tensor.round) -register_elementwise_op(Tensor.rsqrt) -register_elementwise_op(Tensor.short) -register_elementwise_op(Tensor.sigmoid) -register_elementwise_op(Tensor.sign) -register_elementwise_op(Tensor.signbit) -register_elementwise_op(Tensor.sgn) -register_elementwise_op(Tensor.sin) -register_elementwise_op(Tensor.sinc) -register_elementwise_op(Tensor.sinh) -register_elementwise_op(Tensor.asinh) -register_elementwise_op(Tensor.arcsinh) -register_elementwise_op(Tensor.sqrt) -register_elementwise_op(Tensor.square) -register_elementwise_op(Tensor.to) -register_elementwise_op(Tensor.tan) -register_elementwise_op(Tensor.tanh) -register_elementwise_op(Tensor.atanh) -register_elementwise_op(Tensor.arctanh) -register_elementwise_op(Tensor.type) -register_elementwise_op(Tensor.type_as) - -# torch OP -register_elementwise_op(torch.abs) -register_elementwise_op(torch.absolute) -register_elementwise_op(torch.acos) -register_elementwise_op(torch.arccos) -register_elementwise_op(torch.angle) -register_elementwise_op(torch.asin) -register_elementwise_op(torch.arcsin) -register_elementwise_op(torch.atan) -register_elementwise_op(torch.arctan) -register_elementwise_op(torch.all) -register_elementwise_op(torch.any) -register_elementwise_op(torch.bernoulli) -register_elementwise_op(torch.bitwise_not) -register_elementwise_op(torch.ceil) -register_elementwise_op(torch.clamp) -register_elementwise_op(torch.clamp_max) -register_elementwise_op(torch.clamp_min) -register_elementwise_op(torch.clip) -register_elementwise_op(torch.clone) -register_elementwise_op(torch.copysign) -register_elementwise_op(torch.cos) -register_elementwise_op(torch.cosh) -register_elementwise_op(torch.acosh) -register_elementwise_op(torch.arccosh) -register_elementwise_op(torch.deg2rad) -register_elementwise_op(torch.digamma) -register_elementwise_op(torch.erf) -register_elementwise_op(torch.erfc) -register_elementwise_op(torch.erfinv) -register_elementwise_op(torch.exp) -register_elementwise_op(torch.expm1) -register_elementwise_op(torch.fix) -register_elementwise_op(torch.trunc) -register_elementwise_op(torch.float_power) -register_elementwise_op(torch.floor) -register_elementwise_op(torch.frac) -register_elementwise_op(torch.hardshrink) -register_elementwise_op(torch.heaviside) -register_elementwise_op(torch.i0) -register_elementwise_op(torch.isfinite) -register_elementwise_op(torch.isinf) -register_elementwise_op(torch.isposinf) -register_elementwise_op(torch.isneginf) -register_elementwise_op(torch.isnan) -register_elementwise_op(torch.lgamma) -register_elementwise_op(torch.log) -register_elementwise_op(torch.log10) -register_elementwise_op(torch.log1p) -register_elementwise_op(torch.log2) -register_elementwise_op(torch.logical_not) -register_elementwise_op(torch.logit) -register_elementwise_op(torch.nan_to_num) -register_elementwise_op(torch.neg) -register_elementwise_op(torch.negative) -register_elementwise_op(torch.positive) -register_elementwise_op(torch.pow) -register_elementwise_op(torch.rad2deg) -register_elementwise_op(torch.reciprocal) -register_elementwise_op(torch.round) -register_elementwise_op(torch.rsqrt) -register_elementwise_op(torch.sigmoid) -register_elementwise_op(torch.sign) -register_elementwise_op(torch.signbit) -register_elementwise_op(torch.sgn) -register_elementwise_op(torch.sin) -register_elementwise_op(torch.sinc) -register_elementwise_op(torch.sinh) 
-register_elementwise_op(torch.asinh) -register_elementwise_op(torch.arcsinh) -register_elementwise_op(torch.sqrt) -register_elementwise_op(torch.square) -register_elementwise_op(torch.tan) -register_elementwise_op(torch.tanh) -register_elementwise_op(torch.atanh) -register_elementwise_op(torch.arctanh) -register_elementwise_op(torch.zeros_like) - -# nn.functional OP -register_elementwise_op(F.threshold) -register_elementwise_op(F.relu) -register_elementwise_op(F.hardtanh) -register_elementwise_op(F.hardswish) -register_elementwise_op(F.relu6) -register_elementwise_op(F.elu) -register_elementwise_op(F.selu) -register_elementwise_op(F.celu) -register_elementwise_op(F.leaky_relu) -register_elementwise_op(F.prelu) -register_elementwise_op(F.rrelu) -register_elementwise_op(F.gelu) -register_elementwise_op(F.logsigmoid) -register_elementwise_op(F.hardshrink) -register_elementwise_op(F.tanhshrink) -register_elementwise_op(F.softsign) -register_elementwise_op(F.softplus) -register_elementwise_op(F.softmin) -register_elementwise_op(F.softmax) -register_elementwise_op(F.softshrink) -register_elementwise_op(F.gumbel_softmax) -register_elementwise_op(F.log_softmax) -register_elementwise_op(F.tanh) -register_elementwise_op(F.sigmoid) -register_elementwise_op(F.hardsigmoid) -register_elementwise_op(F.silu) -register_elementwise_op(F.mish) -# TODO(ver217): dropout handles seed -register_elementwise_op(F.dropout) -register_elementwise_op(F.alpha_dropout) -register_elementwise_op(F.feature_alpha_dropout) diff --git a/colossalai/legacy/nn/_ops/embedding.py b/colossalai/legacy/nn/_ops/embedding.py deleted file mode 100644 index b145d1763..000000000 --- a/colossalai/legacy/nn/_ops/embedding.py +++ /dev/null @@ -1,142 +0,0 @@ -from typing import Optional - -import torch.nn.functional as F - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input - - -def colo_embedding_1Dcol(input_tensor: ColoTensor, - weight: ColoTensor, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: float = 2.0, - scale_grad_by_freq: bool = False, - sparse: bool = False) -> ColoTensor: - # embedding_1Dcol split the weight(lookup table) to (num_embeddings, embedding_dim/P) - # Gather splitted lookup table - input_tensor = input_tensor.redistribute(ReplicaSpec()) - - output_parallel = F.embedding(input_tensor, - weight, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - output_spec = ColoTensorSpec(weight.get_process_group(), ShardSpec([-1], [weight.get_tp_world_size()]), - ComputeSpec(ComputePattern.TP1D)) - output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec) - - compute_spec = weight.compute_spec - - if compute_spec.output_replicate: - return output.to_replicate() - else: - return output - - -def colo_embedding_1Drow(input_tensor: ColoTensor, - weight: ColoTensor, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: float = 2.0, - scale_grad_by_freq: bool = False, - sparse: bool = False) -> ColoTensor: - # embedding_1Drow splits the weight(lookup table) to the shape, [num_embeddings/P, embedding_dim] - # get the index of current segment and mask other segments with 0 - - # get complete input tensor through all-gather - input_tensor = input_tensor.redistribute(ReplicaSpec()) - - # 
tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D) - tensor_parallel_rank = weight.get_process_group().tp_local_rank() - num_embeddings_per_partition = weight.size_local(0) - vocab_start_index = tensor_parallel_rank * num_embeddings_per_partition - vocab_end_index = vocab_start_index + num_embeddings_per_partition - - # build the mask. - input_mask = (input_tensor < vocab_start_index) | (input_tensor >= vocab_end_index) - # mask the input. - # TODO(jzy) masked_input may be an activation managed by ColoTensor. - masked_input = input_tensor - vocab_start_index - masked_input[input_mask] = 0 - - partial_output = F.embedding(masked_input, - weight, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - - # Mask the output embedding. - partial_output[input_mask, :] = 0. - # Reduce across all the model parallel GPUs. - output = reduce_input(partial_output, weight.get_process_group()) - output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(weight.get_process_group(), ReplicaSpec())) - return output - - -def colo_embedding_1d(mode: str, - input_tensor: ColoTensor, - weight: ColoTensor, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: float = 2.0, - scale_grad_by_freq: bool = False, - sparse: bool = False) -> ColoTensor: - assert mode in ('row', 'col') - funcs = {'row': colo_embedding_1Drow, 'col': colo_embedding_1Dcol} - return funcs[mode](input_tensor, - weight, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - - -@colo_op_impl(F.embedding) -def colo_embedding(input_tensor: GeneralTensor, - weight: GeneralTensor, - padding_idx: Optional[int] = None, - max_norm: Optional[float] = None, - norm_type: float = 2.0, - scale_grad_by_freq: bool = False, - sparse: bool = False): - """Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding``. - This method looks up an embedding table. 
- """ - assert isinstance(weight, ColoTensor) - input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group()) - - if not weight.has_compute_spec(): # No Model Parallel Applied - assert weight.is_replicate(), 'Invalid weight spec for native embedding op' - return ColoTensor.from_torch_tensor(tensor=F.embedding(input_tensor, - weight, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse), - spec=ColoTensorSpec(weight.get_process_group())) - elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied - if weight.is_shard_1drow(): - mode = 'row' - elif weight.is_shard_1dcol(): - mode = 'col' - else: - raise NotImplementedError - return colo_embedding_1d(mode, - input_tensor, - weight, - padding_idx=padding_idx, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - else: - raise NotImplementedError diff --git a/colossalai/legacy/nn/_ops/embedding_bag.py b/colossalai/legacy/nn/_ops/embedding_bag.py deleted file mode 100644 index 9a656d587..000000000 --- a/colossalai/legacy/nn/_ops/embedding_bag.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Optional - -import torch.nn.functional as F -from torch import Tensor - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor - - -def colo_embedding_bag_1Dcol(input_tensor: ColoTensor, - weight: ColoTensor, - offsets: Optional[Tensor] = None, - max_norm: Optional[float] = None, - norm_type: float = 2, - scale_grad_by_freq: bool = False, - mode: str = "mean", - sparse: bool = False, - per_sample_weights: Optional[Tensor] = None, - include_last_offset: bool = False, - padding_idx: Optional[int] = None) -> ColoTensor: - # embedding_bag_1Dcol split the weight(lookup table) to (num_embeddings, embedding_dim/P) - # Gather splitted lookup table - pg = weight.get_process_group() - input_tensor = input_tensor.redistribute(ReplicaSpec()) - - output_parallel = F.embedding_bag(input_tensor, - weight, - offsets=offsets, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - sparse=sparse, - per_sample_weights=per_sample_weights, - include_last_offset=include_last_offset, - padding_idx=padding_idx) - output_spec = ColoTensorSpec(pg, ShardSpec([-1], [weight.get_tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) - output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec) - - if weight.compute_spec.output_replicate: - return output.to_replicate() - else: - return output - - -def colo_embedding_bag_1d(tp_mode: str, - input_tensor: ColoTensor, - weight: ColoTensor, - offsets: Optional[Tensor] = None, - max_norm: Optional[float] = None, - norm_type: float = 2, - scale_grad_by_freq: bool = False, - mode: str = "mean", - sparse: bool = False, - per_sample_weights: Optional[Tensor] = None, - include_last_offset: bool = False, - padding_idx: Optional[int] = None) -> ColoTensor: - assert tp_mode in ('col',) - funcs = {'col': colo_embedding_bag_1Dcol} - return funcs[tp_mode](input_tensor, - weight, - offsets=offsets, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - sparse=sparse, - per_sample_weights=per_sample_weights, - include_last_offset=include_last_offset, - padding_idx=padding_idx) - - 
-@colo_op_impl(F.embedding_bag) -def colo_embedding_bag(input_tensor: GeneralTensor, - weight: GeneralTensor, - offsets: Optional[Tensor] = None, - max_norm: Optional[float] = None, - norm_type: float = 2, - scale_grad_by_freq: bool = False, - mode: str = "mean", - sparse: bool = False, - per_sample_weights: Optional[Tensor] = None, - include_last_offset: bool = False, - padding_idx: Optional[int] = None): - """Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding_bag``. - This method looks up an embedding table. - """ - assert isinstance(weight, ColoTensor) - input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group()) - - # Handle different parallel actions. - - if not weight.has_compute_spec(): # No Model Parallel Applied - assert weight.is_replicate(), 'Invalid weight spec for native embedding op' - return ColoTensor.from_torch_tensor(tensor=F.embedding_bag(input_tensor, - weight, - offsets=offsets, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - sparse=sparse, - per_sample_weights=per_sample_weights, - include_last_offset=include_last_offset, - padding_idx=padding_idx), - spec=ColoTensorSpec(weight.get_process_group())) - elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied - if weight.is_shard_1dcol(): - tp_mode = 'col' - else: - raise NotImplementedError - return colo_embedding_bag_1d(tp_mode, - input_tensor, - weight, - offsets=offsets, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - sparse=sparse, - per_sample_weights=per_sample_weights, - include_last_offset=include_last_offset, - padding_idx=padding_idx) - else: - raise NotImplementedError diff --git a/colossalai/legacy/nn/_ops/layernorm.py b/colossalai/legacy/nn/_ops/layernorm.py deleted file mode 100644 index 9960c5d48..000000000 --- a/colossalai/legacy/nn/_ops/layernorm.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import List, Optional - -import torch.nn.functional as F - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor - - -@colo_op_impl(F.layer_norm) -def colo_layernorm( - input_tensor: GeneralTensor, - normalized_shape: List[int], - weight: Optional[GeneralTensor] = None, - bias: Optional[GeneralTensor] = None, - eps: float = 1e-5, -): - assert isinstance(weight, ColoTensor) - input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group()) - bias = convert_to_colo_tensor(bias, weight.get_process_group()) - input_tensor = input_tensor.redistribute(ReplicaSpec()) - - output = F.layer_norm(input_tensor, normalized_shape, weight=weight, bias=bias, eps=eps) - output = ColoTensor.from_torch_tensor(tensor=output, - spec=ColoTensorSpec(pg=input_tensor.get_process_group(), - dist_attr=input_tensor.dist_spec)) - return output diff --git a/colossalai/legacy/nn/_ops/linear.py b/colossalai/legacy/nn/_ops/linear.py deleted file mode 100644 index 2f2088c61..000000000 --- a/colossalai/legacy/nn/_ops/linear.py +++ /dev/null @@ -1,171 +0,0 @@ -from copy import deepcopy -from typing import Optional - -import torch.nn.functional as F - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec -from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor.sharding_spec import ShardingSpec - -from ._utils import GeneralTensor, 
convert_to_colo_tensor, reduce_grad, reduce_input - - -def colo_linear_1drow(input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor': - # Input:S[1] x Weight:S[0] = Output:P - # All-Reduce(Output) + bias = res - # Input:S[1] - pg = weight.get_process_group() - input_tensor = input_tensor.redistribute(ShardSpec([-1], [weight.get_tp_world_size()]), pg) - - # Output:P - partial_output = F.linear(input_tensor, weight) - # Reduce(Output) - - output = reduce_input(partial_output, pg) - # Bias - if bias is not None: - assert not bias.has_compute_spec(), 'Invalid bias spec for 1Drow Linear op' - output = output + bias - - output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(pg, ReplicaSpec())) - return output - - -def colo_linear_1dcol(input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor': - # Input:B x Weight:S[1] + Bias:S[1] = Output:S[1] - # All-Gather(Output) - # Input:B - compute_spec = weight.compute_spec - input_tensor = input_tensor.redistribute(ReplicaSpec()) - input_parallel = reduce_grad(input_tensor, weight.get_process_group()) - - output_parallel = F.linear(input_parallel, weight, bias) - output = ColoTensor.from_torch_tensor(output_parallel, - spec=ColoTensorSpec(weight.get_process_group(), - ShardSpec([-1], [weight.get_tp_world_size()]), - ComputeSpec(ComputePattern.TP1D))) - if compute_spec.output_replicate: - return output.to_replicate() - else: - return output - - -def colo_linear_1d(mode: str, input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor': - assert mode in ('row', 'col') - funcs = {'row': colo_linear_1drow, 'col': colo_linear_1dcol} - return funcs[mode](input_tensor, weight, bias) - - -# @register_colo_graph(input_pos=[1], param_pos=[2, 3]) -def colo_linear_imp(input_tensor: GeneralTensor, - weight: GeneralTensor, - bias: Optional[GeneralTensor] = None) -> 'ColoTensor': - """Handles ``__torch_function__`` dispatch for ``torch.nn.functional.linear``. - This method computes a linear. - """ - assert isinstance(weight, ColoTensor) - pg = weight.get_process_group() - assert pg - input_tensor = convert_to_colo_tensor(input_tensor, pg) - bias = convert_to_colo_tensor(bias, pg) - # input_tensor, weight, bias = tuple(map(convert_to_colo_tensor, (input_tensor, weight, bias))) - - # Add communication logic before and after linear call. - ret_tensor = None - if not weight.has_compute_spec(): # No Model Parallel Applied - assert weight.is_replicate(), 'Invalid weight spec for native Linear op' - assert bias is None or bias.is_replicate(), 'Invalid bias spec for native Linear op' - ret_tensor = ColoTensor.from_torch_tensor(F.linear(input_tensor, weight, bias), spec=ColoTensorSpec(pg)) - elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied - if weight.is_shard_1dcol() and (bias is None or bias.is_replicate()): - mode = 'row' - elif weight.is_shard_1drow() and (bias is None or bias.is_shard_1drow() or bias.is_shard_1dcol()): - mode = 'col' - else: - raise RuntimeError(f"the weight or bias tensor spec is not valid, weight {weight}, bias {bias}") - ret_tensor = colo_linear_1d(mode, input_tensor, weight, bias) - else: - raise NotImplementedError - - return ret_tensor - - -def _new_colo_linear_imp(input_tensor: GeneralTensor, - weight: GeneralTensor, - bias: Optional[GeneralTensor] = None) -> 'ColoTensor': - """ - A tentative function to compute the distributed linear layer with the latest sharding spec. 
- This function is subject to future change as the current sharding API is not stable. - """ - # get mesh info - input_sharding_seq = input_tensor.sharding_spec.sharding_sequence - weight_sharding_seq = weight.sharding_spec.sharding_sequence - if bias is not None: - bias_sharding_seq = bias.sharding_spec.sharding_sequence - device_mesh = weight.sharding_spec.device_mesh - pg_axis0 = weight.pg_axis0 - pg_axis1 = weight.pg_axis1 - - # the last dim of input should have the same spec as the first dim of weight - # the weight is transposed, so we look at the second dimension - assert input_sharding_seq[-1] == weight_sharding_seq[1] - - if bias is not None: - assert bias_sharding_seq[0] == weight_sharding_seq[0] - - # compute the output sharding sequence - # as weight is transposed, so we look at the first dimension - output_shard_seq = input_sharding_seq[:-1] + weight_sharding_seq[:1] - output_shard_seq = deepcopy(output_shard_seq) - - # TODO: add reduce grad logic - - # handle column and row parallel linear - # by reusing the implementation above - out = F.linear(input_tensor, weight) - - # run all reduce if necessary - last_dim_spec = input_sharding_seq[-1] - if last_dim_spec.is_replica: - pass - elif last_dim_spec.shard_list is not None: - for dim in last_dim_spec.shard_list: - if dim == 0: - reduce_input(out, pg_axis0) - elif dim == 1: - reduce_input(out, pg_axis1) - else: - raise RuntimeError("Found invalid sharding axis {dim}, only 0 or 1 is expected") - # add bias - if bias is not None: - out += bias - - # convert shard seq to partition dict - output_partition_dict = {} - for index, dim_spec in enumerate(output_shard_seq): - if not dim_spec.is_replica: - if index not in output_partition_dict: - output_partition_dict[index] = [] - output_partition_dict[index].extend(dim_spec.shard_list) - - entire_shape = out.shape - output_sharding_spec = ShardingSpec(device_mesh, entire_shape, output_partition_dict) - ret_tensor = ColoTensor.from_torch_tensor(out) - setattr(ret_tensor, 'sharding_spec', output_sharding_spec) - return ret_tensor - - -def _has_sharding_spec(tensor): - """ - A tentative function to check whether the tensor is using the new sharding spec API. We assume that the sharding spec object is - set as the attribute `sharding_spec` on a tensor. 
- """ - return hasattr(tensor, 'sharding_spec') - - -@colo_op_impl(F.linear) -def colo_linear(input: GeneralTensor, weight: GeneralTensor, bias: Optional[GeneralTensor] = None) -> 'ColoTensor': - if _has_sharding_spec(weight): - return _new_colo_linear_imp(input, weight, bias) - else: - return colo_linear_imp(input, weight, bias) diff --git a/colossalai/legacy/nn/_ops/loss.py b/colossalai/legacy/nn/_ops/loss.py deleted file mode 100644 index 90efbfa36..000000000 --- a/colossalai/legacy/nn/_ops/loss.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import Optional - -import torch -import torch.nn.functional as F - -from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D -from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.tensor.op_wrapper import colo_op_impl - -from ._utils import GeneralTensor, convert_to_colo_tensor - - -@colo_op_impl(F.cross_entropy) -def colo_cross_entropy(input_tensor: GeneralTensor, - target: GeneralTensor, - weight: Optional[GeneralTensor] = None, - size_average: Optional[bool] = None, - ignore_index: int = -100, - reduce: Optional[bool] = None, - reduction: str = "mean", - label_smoothing: float = 0.0): - assert isinstance(weight, ColoTensor) or isinstance(target, ColoTensor) or isinstance(input_tensor, ColoTensor) - pg = input_tensor.get_process_group() if isinstance(input_tensor, ColoTensor) else isinstance(target, ColoTensor) - weight = convert_to_colo_tensor(weight, pg) - target = convert_to_colo_tensor(target, pg) - input_tensor = convert_to_colo_tensor(input_tensor, pg) - - if input_tensor.is_replicate(): # Input is gathered - assert target.is_replicate() and (weight is None or weight.is_replicate()), \ - "Target tensor and weight tensor both should be complete" - output = F.cross_entropy(input_tensor, - target, - weight=weight, - size_average=size_average, - ignore_index=ignore_index, - reduce=reduce, - reduction=reduction, - label_smoothing=label_smoothing) - return ColoTensor.from_torch_tensor(output, ColoTensorSpec(pg)) - elif input_tensor.has_compute_spec(): # Single Model Parallel Applied - if input_tensor.is_shard_1dcol(): - assert weight is None, "Current TP cross entropy loss function doesn't support passing weight tensor in" - assert target.is_replicate(), "Target tensor should be complete in TP cross entropy loss function" - output = VocabParallelCrossEntropyLoss1D()(input_tensor, - target, - process_group=input_tensor.process_group.tp_process_group()) - return ColoTensor.from_torch_tensor(output, ColoTensorSpec(pg)) - else: - raise NotImplementedError - else: - raise NotImplementedError diff --git a/colossalai/legacy/nn/_ops/view.py b/colossalai/legacy/nn/_ops/view.py deleted file mode 100644 index 3c0bc5233..000000000 --- a/colossalai/legacy/nn/_ops/view.py +++ /dev/null @@ -1,96 +0,0 @@ -import operator -from functools import reduce -from typing import Optional, Union - -import torch - -from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec -from colossalai.tensor.op_wrapper import colo_op_impl - - -def _all_int(my_iter): - return all(isinstance(i, int) for i in my_iter) - - -def _get_valid_shape(shape): - if isinstance(shape, list): - if _all_int(shape): - return tuple(shape) - else: - raise RuntimeError("expects type(int) but finds an other type") - elif isinstance(shape, tuple): - if _all_int(shape): - return shape - else: - return _get_valid_shape(shape[0]) - else: - raise RuntimeError("expects an iterable array but finds '{}'".format(type(shape))) - - -def _shape_infer(org_sp, tgt_sp): 
- cnt = 0 - pos = 0 - for idx, dim in enumerate(tgt_sp): - if dim < -1: - raise RuntimeError("invalid shape dimension {}".format(dim)) - elif dim == -1: - cnt += 1 - pos = idx - - if cnt > 1: - raise RuntimeError("only one dimension can be inferred") - - org_prod = reduce(operator.mul, org_sp, 1) - tgt_prod = reduce(operator.mul, tgt_sp, 1) - - if cnt == 0: - if org_prod != tgt_prod: - raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod)) - else: - return tgt_sp - elif org_prod % tgt_prod != 0: - raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod)) - - infer_dim = -(org_prod // tgt_prod) - return tgt_sp[:pos] + (infer_dim,) + tgt_sp[pos + 1:] - - -@colo_op_impl(torch.Tensor.view) -def colo_view(self: ColoTensor, *shape) -> 'ColoTensor': - """Handles ``__torch_function__`` dispatch for ``torch.Tensor.view``. - Changes the shape of the current tensor. - """ - assert isinstance(self, ColoTensor) - # apply original `view` function for replicated colo tensors - if self.is_replicate(): - return self.view(*shape) - - cur_sp = self.size() - org_sp = self.size_global() - # parse the passed arguments - tgt_sp = _get_valid_shape(shape) - # get the correct shape from inference - inf_sp = _shape_infer(org_sp, tgt_sp) - - if self.is_shard_1drow() and org_sp[0] == inf_sp[0]: - new_shape = (cur_sp[0],) + tgt_sp[1:] - res = self.view(*new_shape) - elif self.is_shard_1dcol() and org_sp[-1] == inf_sp[-1]: - new_shape = tgt_sp[:-1] + (cur_sp[-1],) - res = self.view(*new_shape) - else: - replicated_t = self.redistribute(dist_spec=ReplicaSpec()) - return ColoTensor.from_torch_tensor(tensor=replicated_t.view(*shape), - spec=ColoTensorSpec(self.get_process_group())) - - return ColoTensor.from_torch_tensor(tensor=res, - spec=ColoTensorSpec(pg=self.get_process_group(), dist_attr=self.dist_spec)) - - -@colo_op_impl(torch.Tensor.size) -def colo_size(self: ColoTensor, dim: Optional[int] = None) -> Union[torch.Size, int]: - size = self.size_global() - if dim is None: - return size - else: - return size[dim] diff --git a/colossalai/legacy/nn/layer/base_layer.py b/colossalai/legacy/nn/layer/base_layer.py index 4a06bdcb7..01fd9b3e8 100644 --- a/colossalai/legacy/nn/layer/base_layer.py +++ b/colossalai/legacy/nn/layer/base_layer.py @@ -5,8 +5,8 @@ from contextlib import contextmanager import torch.nn as nn -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc class ParallelLayer(nn.Module): diff --git a/colossalai/legacy/nn/layer/colossalai_layer/dropout.py b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py index 0c049cb3f..7b0481a3f 100644 --- a/colossalai/legacy/nn/layer/colossalai_layer/dropout.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py @@ -1,6 +1,6 @@ import torch.nn as nn -from colossalai.context import ParallelMode, seed +from colossalai.legacy.context import ParallelMode, seed from ..parallel_1d import * from ..utils import get_tensor_parallel_mode diff --git a/colossalai/legacy/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py index 300baf9c1..db9dfa366 100644 --- a/colossalai/legacy/nn/layer/parallel_1d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_1d/_operation.py @@ -1,7 +1,7 @@ import torch import torch.distributed as dist -from colossalai.core import global_context as gpc +from colossalai.legacy.core import 
global_context as gpc try: import fused_mix_prec_layer_norm_cuda diff --git a/colossalai/legacy/nn/layer/parallel_1d/_utils.py b/colossalai/legacy/nn/layer/parallel_1d/_utils.py index fddf4e73d..15b41e305 100644 --- a/colossalai/legacy/nn/layer/parallel_1d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_1d/_utils.py @@ -4,8 +4,8 @@ import torch import torch.distributed as dist -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from ..utils import divide diff --git a/colossalai/legacy/nn/layer/parallel_1d/layers.py b/colossalai/legacy/nn/layer/parallel_1d/layers.py index c0a169c15..db7986b8e 100644 --- a/colossalai/legacy/nn/layer/parallel_1d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_1d/layers.py @@ -10,18 +10,18 @@ import torch.nn.functional as F from torch import Tensor from torch.nn.parameter import Parameter -from colossalai.context import ParallelMode, seed -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env from colossalai.kernel import LayerNorm from colossalai.legacy.communication import broadcast +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.context.parallel_context import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import LAYERS -from colossalai.nn import init as init -from colossalai.utils.checkpointing import ( +from colossalai.legacy.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict, ) +from colossalai.nn import init as init from colossalai.utils.cuda import get_current_device from ..base_layer import ParallelLayer diff --git a/colossalai/legacy/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py index fa9b49bcf..43e14d4a4 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py @@ -5,10 +5,10 @@ import torch.distributed as dist from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.utils import get_current_device @@ -31,9 +31,9 @@ def matmul_2d( out_shape (:class:`torch.size`): shape of output tensor. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional): + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional): row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW. - col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional): + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional): column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL. 
Returns: @@ -146,8 +146,8 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int, out_shape (:class:`torch.size`): shape of output tensor. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -172,8 +172,8 @@ class Matmul_AB_2D(torch.autograd.Function): out_shape (:class:`torch.size`): shape of output tensor. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -299,8 +299,8 @@ class Matmul_ABT_2D(torch.autograd.Function): out_shape (:class:`torch.size`): shape of output tensor. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank @@ -433,8 +433,8 @@ class Matmul_ATB_2D(torch.autograd.Function): out_shape (:class:`torch.size`): shape of output tensor. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -620,8 +620,8 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro output_size_per_partition (int): size of output per partition. row_rank (int, optional): the rank of row, defaults to None. col_rank (int, optional): the rank of column, defaults to None. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. 
+ row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion. data_parallel_rank (int): data parallel rank. @@ -685,8 +685,8 @@ def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, r E_x (:class:`torch.tensor`): mean. Var_x (:class:`torch.tensor`): variance. hidden_size (int): hidden size. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -719,7 +719,7 @@ def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) Args: tensor (:class:`torch.tensor`): Input tensor. dim (int): Dimension to gather. - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -767,7 +767,7 @@ def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: Args: input_ (:class:`torch.tensor`): Input tensor. - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -795,7 +795,7 @@ def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMo Args: tensor (:class:`torch.tensor`): Input tensor. dim (int): Dimension to reduce. - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used. Note: The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found diff --git a/colossalai/legacy/nn/layer/parallel_2d/_utils.py b/colossalai/legacy/nn/layer/parallel_2d/_utils.py index 012fec41c..87ba1bf69 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_utils.py @@ -1,6 +1,6 @@ -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env def get_summa_dim_from_env() -> int: diff --git a/colossalai/legacy/nn/layer/parallel_2d/layers.py b/colossalai/legacy/nn/layer/parallel_2d/layers.py index b458d15c7..893bc74b5 100644 --- a/colossalai/legacy/nn/layer/parallel_2d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_2d/layers.py @@ -8,13 +8,16 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.context import ParallelMode, seed -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication import broadcast +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import LAYERS +from colossalai.legacy.utils.checkpointing import ( + gather_tensor_parallel_state_dict, + partition_tensor_parallel_state_dict, +) from colossalai.nn import init as init -from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict from colossalai.utils.cuda import get_current_device from ..base_layer import ParallelLayer diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py index 55defa4a3..1226162ae 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py @@ -5,9 +5,9 @@ import torch.distributed as dist from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.utils import get_current_device @@ -112,8 +112,8 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T out_shape (:class:`torch.size`): shape of output tensor. row_rank (int): the rank of row. col_rank (int): the rank of column. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -139,8 +139,8 @@ class Matmul_AB_2p5D(torch.autograd.Function): row_rank (int): the rank of row. 
col_rank (int): the rank of column. dep_rank (int): the rank of depth. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -264,8 +264,8 @@ class Matmul_ABT_2p5D(torch.autograd.Function): row_rank (int): the rank of row. col_rank (int): the rank of column. dep_rank (int): the rank of depth. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -394,8 +394,8 @@ class Matmul_ATB_2p5D(torch.autograd.Function): row_rank (int): the rank of row. col_rank (int): the rank of column. dep_rank (int): the rank of depth. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. data_parallel_rank (int): data parallel rank. pipeline_parallel_rank (int): pipeline parallel rank pipeline_parallel_size (int): pipeline parallel size. @@ -606,7 +606,7 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t row_rank (int): the rank of row. col_rank (int): the rank of column. dep_rank (int): the rank of depth. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion. data_parallel_rank (int): data parallel rank. @@ -631,7 +631,7 @@ class _Layernorm2p5D(torch.autograd.Function): E_x (:class:`torch.tensor`): mean. Var_x (:class:`torch.tensor`): variance. hidden_size (int): hidden size. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -682,7 +682,7 @@ def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, E_x (:class:`torch.tensor`): mean. Var_x (:class:`torch.tensor`): variance. hidden_size (int): hidden size. - row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode. + row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found @@ -715,7 +715,7 @@ def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: Parallel Args: inputs (:class:`torch.tensor`): input tensor. dim (int): dimension of all-gather. - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -730,7 +730,7 @@ class SplitFirst(torch.autograd.Function): Args: inputs (:class:`torch.tensor`): input tensor. tesseract_dim (int): dimension of TESSERACT fo 2.5D parallelism - col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode. + col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -798,7 +798,7 @@ def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor: Args: input_ (:class:`torch.tensor`): Input tensor. - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -826,7 +826,7 @@ def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: Parallel Args: input_ (:class:`torch.tensor`): Input tensor. dim (int): Dimension to reduce. - parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used. Note: The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py index 1478b25de..69a350a97 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py @@ -1,6 +1,6 @@ -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env def get_tesseract_dim_dep_from_env(): diff --git a/colossalai/legacy/nn/layer/parallel_2p5d/layers.py b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py index 04acc2bb0..b4aa9f16d 100644 --- a/colossalai/legacy/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py @@ -8,17 +8,17 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.context import ParallelMode, seed -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication import broadcast +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.registry import LAYERS -from colossalai.nn import init as init -from colossalai.utils.checkpointing import ( +from colossalai.legacy.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict, ) +from colossalai.nn import init as init from colossalai.utils.cuda import get_current_device from ..base_layer import ParallelLayer diff --git a/colossalai/legacy/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py index ca0b0e627..c6374efb7 100755 --- a/colossalai/legacy/nn/layer/parallel_3d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py @@ -7,10 +7,10 @@ import torch from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter +from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from ._utils import get_parallel_mode_from_env, push_async_grad @@ -73,9 +73,9 @@ def linear_3d( Args: input_ (:class:`torch.tensor`): input matrix. weight (:class:`torch.tensor`): matrix of weight. - input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode. - weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode. - output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode. + input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode. + weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode. 
+ output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -166,9 +166,9 @@ def classifier_3d( input_ (:class:`torch.tensor`): input matrix. weight (:class:`torch.tensor`): matrix of weight. bias (:class:`torch.tensor`): matrix of bias. - input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode. - weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode. - output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode. + input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode. + weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode. + output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -260,9 +260,9 @@ def vocab_parallel_classifier_3d( input_ (:class:`torch.tensor`): input matrix. weight (:class:`torch.tensor`): matrix of weight. bias (:class:`torch.tensor`): matrix of bias. - input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode. - weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode. - output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode. + input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode. + weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode. + output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -378,8 +378,8 @@ def layernorm_3d( If a single integer is used, it is treated as a singleton list, and this module will normalize over the last dimension which is expected to be of that specific size. eps (float): a value added to the denominator for numerical stability - output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode. - input_x_weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input x weight parallel mode. + output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode. + input_x_weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input x weight parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -404,7 +404,7 @@ def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Te Args: tensor (:class:`torch.tensor`): Input tensor. dim (int): Specified dimension in which to split. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode. + parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): Parallel mode. Returns: :class:`torch.tensor`: The tensor has been split. @@ -434,8 +434,8 @@ def split_batch_3d(input_: Tensor, Args: input_ (:class:`torch.tensor`): Input tensor. 
dim (int): Specified dimension in which to split. - input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode. - weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode. + input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): input parallel mode. + weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): weight parallel mode. Returns: :class:`torch.tensor`: The tensor has been split. @@ -471,7 +471,7 @@ def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor: Args: tensor (:class:`torch.tensor`): Input tensor. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode. + parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -501,7 +501,7 @@ def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) Args: tensor (:class:`torch.tensor`): Input tensor. dim (int): Dimension to gather. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode. + parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -530,7 +530,7 @@ def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMo Args: tensor (:class:`torch.tensor`): Input tensor. dim (int): Dimension to scatter. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode. + parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -578,8 +578,8 @@ def reduce_by_batch_3d(tensor: Tensor, r"""All-reduce the input from the model parallel region. Args: - input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode. - weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode. + input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode. + weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode. reduce_mean (bool, optional): If set to ``True``, it will divide the output by (input parallel size * weight parallel size), default to False. 
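The hunks above, and the remaining files in this diff, apply the same mechanical change: the parallel context, global context, constants, and tensor-parallel environment move from the top-level package into the colossalai.legacy namespace. Downstream code that imported the old paths would be updated in the same way; a minimal before/after sketch, assuming this revision is installed and the global parallel context has already been initialized:

# Old imports, as removed throughout this diff:
#   from colossalai.context import ParallelMode
#   from colossalai.core import global_context as gpc

# New imports after the move into the legacy namespace:
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc

# Call sites are unchanged; only the module path moves, e.g.:
rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)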
diff --git a/colossalai/legacy/nn/layer/parallel_3d/_utils.py b/colossalai/legacy/nn/layer/parallel_3d/_utils.py index 364191a79..cb300c2a9 100644 --- a/colossalai/legacy/nn/layer/parallel_3d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_utils.py @@ -4,9 +4,15 @@ from functools import partial import torch from torch import Tensor -from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.constants import ( + INPUT_GROUP_3D, + INPUT_X_WEIGHT_3D, + OUTPUT_GROUP_3D, + OUTPUT_X_WEIGHT_3D, + WEIGHT_GROUP_3D, +) +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env def get_depth_from_env() -> int: diff --git a/colossalai/legacy/nn/layer/parallel_3d/layers.py b/colossalai/legacy/nn/layer/parallel_3d/layers.py index b815a842c..d6aaa427b 100644 --- a/colossalai/legacy/nn/layer/parallel_3d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_3d/layers.py @@ -8,19 +8,25 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D -from colossalai.context import ParallelMode, seed -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env from colossalai.legacy.communication import all_reduce, broadcast +from colossalai.legacy.constants import ( + INPUT_GROUP_3D, + INPUT_X_WEIGHT_3D, + OUTPUT_GROUP_3D, + OUTPUT_X_WEIGHT_3D, + WEIGHT_GROUP_3D, +) +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.registry import LAYERS -from colossalai.nn import init as init -from colossalai.utils.checkpointing import ( +from colossalai.legacy.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict, ) +from colossalai.nn import init as init from colossalai.utils.cuda import get_current_device from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple diff --git a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py index fcf296201..ea1863f0b 100644 --- a/colossalai/legacy/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -5,9 +5,9 @@ import torch from torch import distributed as dist from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.legacy.communication import ring_forward +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range from colossalai.utils import get_current_device diff --git a/colossalai/legacy/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py index e44e61c2f..033c1be96 100644 --- 
a/colossalai/legacy/nn/layer/parallel_sequence/layers.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py @@ -9,11 +9,11 @@ import torch.nn.functional as F from torch.nn import Parameter import colossalai -from colossalai.context import seed -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.kernel import FusedScaleMaskSoftmax from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.context import seed +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK from colossalai.legacy.registry import LAYERS diff --git a/colossalai/legacy/nn/layer/utils/common.py b/colossalai/legacy/nn/layer/utils/common.py index d8f3ad2a7..3148a0bed 100644 --- a/colossalai/legacy/nn/layer/utils/common.py +++ b/colossalai/legacy/nn/layer/utils/common.py @@ -8,9 +8,9 @@ import numpy as np import torch from torch import Tensor, nn -from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS -from colossalai.global_variables import tensor_parallel_env as env -from colossalai.utils import checkpoint +from colossalai.legacy.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS +from colossalai.legacy.global_variables import tensor_parallel_env as env +from colossalai.legacy.utils import checkpoint class CheckpointModule(nn.Module): diff --git a/colossalai/legacy/nn/layer/vanilla/layers.py b/colossalai/legacy/nn/layer/vanilla/layers.py index 0e11fc4d0..71ca1d421 100644 --- a/colossalai/legacy/nn/layer/vanilla/layers.py +++ b/colossalai/legacy/nn/layer/vanilla/layers.py @@ -7,7 +7,7 @@ from torch import Tensor from torch import nn as nn from torch.nn.parameter import Parameter -from colossalai.context import seed +from colossalai.legacy.context import seed from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init from colossalai.utils.cuda import get_current_device @@ -64,7 +64,7 @@ class WrappedDropout(nn.Module): Args: p (float, optional): probability of an element to be zeroed, defaults 0.5. inplace (bool, optional): whether to do dropout in-place, default to be False. - mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found @@ -101,7 +101,7 @@ class WrappedDropPath(nn.Module): Args: p (float, optional): probability of dropping path, defaults 0.0. - mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode. + mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode. Note: The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found diff --git a/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py index 68fea8622..ec19d1b70 100644 --- a/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py +++ b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py @@ -3,8 +3,8 @@ from typing import List, Tuple, Union import torch.distributed as dist import torch.nn as nn -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc class PipelineSharedModuleWrapper: diff --git a/colossalai/legacy/nn/loss/__init__.py b/colossalai/legacy/nn/loss/__init__.py index 1bd8872d9..abb7ec3ef 100644 --- a/colossalai/legacy/nn/loss/__init__.py +++ b/colossalai/legacy/nn/loss/__init__.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.loss import * from torch.nn.modules.loss import _Loss -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode from .loss_1d import VocabParallelCrossEntropyLoss1D diff --git a/colossalai/legacy/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py index 8c9483fcc..2582e8b35 100644 --- a/colossalai/legacy/nn/loss/loss_1d.py +++ b/colossalai/legacy/nn/loss/loss_1d.py @@ -3,8 +3,8 @@ import torch.distributed as dist from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.modules.loss import _Loss -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import LOSSES diff --git a/colossalai/legacy/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py index 6191602b7..7ab584156 100644 --- a/colossalai/legacy/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.legacy.registry import LOSSES diff --git a/colossalai/legacy/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py index 2746b2011..8a5d04a8c 100644 --- a/colossalai/legacy/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.legacy.registry import LOSSES diff --git a/colossalai/legacy/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py 
index 2aeb1bd98..a576d84f7 100644 --- a/colossalai/legacy/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss -from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.core import global_context as gpc +from colossalai.legacy.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.legacy.registry import LOSSES diff --git a/colossalai/legacy/nn/metric/accuracy_3d.py b/colossalai/legacy/nn/metric/accuracy_3d.py index 1aaac73ec..675f5c2b5 100644 --- a/colossalai/legacy/nn/metric/accuracy_3d.py +++ b/colossalai/legacy/nn/metric/accuracy_3d.py @@ -1,7 +1,7 @@ import torch from torch import nn -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env diff --git a/colossalai/legacy/nn/parallel/data_parallel.py b/colossalai/legacy/nn/parallel/data_parallel.py index f839d6b28..2b2ad36a7 100644 --- a/colossalai/legacy/nn/parallel/data_parallel.py +++ b/colossalai/legacy/nn/parallel/data_parallel.py @@ -5,7 +5,7 @@ from typing import Iterable, Optional, Set import torch import torch.distributed as dist -from colossalai.tensor import ProcessGroup as ColoProcessGroup +from colossalai.legacy.tensor import ProcessGroup as ColoProcessGroup from colossalai.utils import is_ddp_ignored from .reducer import Reducer @@ -34,8 +34,8 @@ class ColoDDP(torch.nn.Module): """Distributed data parallel for ColoTensor. Nested ColoDDP is not supported now. 
Example: - >>> from colossalai.core import global_context as gpc - >>> from colossalai.context import ParallelMode + >>> from colossalai.legacy.core import global_context as gpc + >>> from colossalai.legacy.context import ParallelMode >>> model = torch.nn.Linear(20, 1) >>> pg = ProcessGroup(tp_degree = world_size//2) >>> model = ColoDDP(model, pg) diff --git a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py index 79d7672b2..522fb4f44 100644 --- a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py @@ -4,7 +4,8 @@ import torch import torch.nn.functional as F from colossalai.legacy.nn._ops._utils import dual_all_to_all -from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec +from colossalai.legacy.tensor import ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec +from colossalai.tensor import ColoParameter, ColoTensor from .cache_mgr import CachedParamMgr, EvictionStrategy from .cached_embedding import CachedEmbeddingBag diff --git a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py index 116d836b7..a1feda2bd 100644 --- a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py @@ -6,7 +6,7 @@ import torch.distributed as dist import torch.nn.functional as F from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise -from colossalai.tensor import ProcessGroup +from colossalai.legacy.tensor import ProcessGroup from .cache_mgr import EvictionStrategy from .cached_embedding import CachedEmbeddingBag diff --git a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py index 0014c784f..8017ee72b 100644 --- a/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py @@ -7,7 +7,7 @@ import torch.nn as nn from torch.profiler import record_function from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise -from colossalai.tensor import ProcessGroup +from colossalai.legacy.tensor import ProcessGroup from .cache_mgr import EvictionStrategy from .cached_embedding import CachedEmbeddingBag diff --git a/colossalai/legacy/nn/parallel/layers/colo_module.py b/colossalai/legacy/nn/parallel/layers/colo_module.py index a0a3eb40c..69d92afaa 100644 --- a/colossalai/legacy/nn/parallel/layers/colo_module.py +++ b/colossalai/legacy/nn/parallel/layers/colo_module.py @@ -1,7 +1,7 @@ from typing import Dict, List -from colossalai.tensor import ComputePattern -from colossalai.tensor.distspec import _DistSpec +from colossalai.legacy.tensor import ComputePattern +from colossalai.legacy.tensor.distspec import _DistSpec class ColoModule(object): diff --git a/colossalai/legacy/nn/parallel/layers/embedding.py b/colossalai/legacy/nn/parallel/layers/embedding.py index 3e4e7ffd8..4796699fc 100644 --- 
a/colossalai/legacy/nn/parallel/layers/embedding.py +++ b/colossalai/legacy/nn/parallel/layers/embedding.py @@ -1,4 +1,4 @@ -from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec +from colossalai.legacy.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec from .colo_module import ColoModule diff --git a/colossalai/legacy/nn/parallel/layers/linear.py b/colossalai/legacy/nn/parallel/layers/linear.py index e391cf808..51a8d4c97 100644 --- a/colossalai/legacy/nn/parallel/layers/linear.py +++ b/colossalai/legacy/nn/parallel/layers/linear.py @@ -1,4 +1,4 @@ -from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec +from colossalai.legacy.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec from .colo_module import ColoModule diff --git a/colossalai/legacy/nn/parallel/layers/module_utils.py b/colossalai/legacy/nn/parallel/layers/module_utils.py index 191266fa7..09326d2d6 100644 --- a/colossalai/legacy/nn/parallel/layers/module_utils.py +++ b/colossalai/legacy/nn/parallel/layers/module_utils.py @@ -2,7 +2,8 @@ from typing import Dict import torch -from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec +from colossalai.legacy.tensor import ComputeSpec, ProcessGroup, distspec +from colossalai.tensor import ColoParameter from . import ColoModule diff --git a/colossalai/legacy/pipeline/__init__.py b/colossalai/legacy/pipeline/__init__.py new file mode 100644 index 000000000..f36f54ac9 --- /dev/null +++ b/colossalai/legacy/pipeline/__init__.py @@ -0,0 +1,4 @@ +from .layer_spec import LayerSpec +from .pipelinable import PipelinableContext, PipelinableModel + +__all__ = ['PipelinableModel', 'PipelinableContext', 'LayerSpec'] diff --git a/colossalai/pipeline/layer_spec.py b/colossalai/legacy/pipeline/layer_spec.py similarity index 97% rename from colossalai/pipeline/layer_spec.py rename to colossalai/legacy/pipeline/layer_spec.py index 7e9169eff..3960debd7 100644 --- a/colossalai/pipeline/layer_spec.py +++ b/colossalai/legacy/pipeline/layer_spec.py @@ -1,9 +1,11 @@ import torch + from colossalai.utils.model.utils import call_to_str + class LayerSpec: """ - + """ def __init__(self, typename, *module_args, **module_kwargs): @@ -52,4 +54,4 @@ class LayerSpec: return self._param_count def reset_param_count(self): - self._param_count = 0 \ No newline at end of file + self._param_count = 0 diff --git a/colossalai/legacy/pipeline/middleware/__init__.py b/colossalai/legacy/pipeline/middleware/__init__.py new file mode 100644 index 000000000..481741bfe --- /dev/null +++ b/colossalai/legacy/pipeline/middleware/__init__.py @@ -0,0 +1,3 @@ +from .topo import Partition, PartitionInputVal, PartitionOutputVal, Topo + +__all__ = ['Topo', 'Partition', 'PartitionOutputVal', 'PartitionInputVal'] diff --git a/colossalai/pipeline/middleware/adaptor/__init__.py b/colossalai/legacy/pipeline/middleware/adaptor/__init__.py similarity index 62% rename from colossalai/pipeline/middleware/adaptor/__init__.py rename to colossalai/legacy/pipeline/middleware/adaptor/__init__.py index 949700a2c..0b0d36d2f 100644 --- a/colossalai/pipeline/middleware/adaptor/__init__.py +++ b/colossalai/legacy/pipeline/middleware/adaptor/__init__.py @@ -1,3 +1,3 @@ from .fx import get_topology as get_fx_topology -__all__ = ['get_fx_topology'] \ No newline at end of file +__all__ = ['get_fx_topology'] diff --git a/colossalai/pipeline/middleware/adaptor/fx.py b/colossalai/legacy/pipeline/middleware/adaptor/fx.py similarity index 92% rename from 
colossalai/pipeline/middleware/adaptor/fx.py rename to colossalai/legacy/pipeline/middleware/adaptor/fx.py index 8437c5194..8cc40f120 100644 --- a/colossalai/pipeline/middleware/adaptor/fx.py +++ b/colossalai/legacy/pipeline/middleware/adaptor/fx.py @@ -1,6 +1,8 @@ -from torch.fx.graph_module import GraphModule -from colossalai.pipeline.middleware.topo import Partition, PartitionInputVal, PartitionOutputVal, Topo import torch +from torch.fx.graph_module import GraphModule + +from colossalai.legacy.pipeline.middleware.topo import Partition, PartitionInputVal, PartitionOutputVal, Topo + def partition_name_to_id(partition_name, is_input=False, is_output=False): if is_input: @@ -12,6 +14,7 @@ def partition_name_to_id(partition_name, is_input=False, is_output=False): partition_id = int(partition_name.split(prefix)[-1]) + 2 return partition_id + # There are two kinds of def in fx.graph # 1. non direct_use & non direct_def, which means the output is used by next partition with a temporary mid value. # e.g. submod1 = call_module(...) @@ -20,6 +23,8 @@ def partition_name_to_id(partition_name, is_input=False, is_output=False): # 2. direct_use & direct_def, which means the output is used by next partition directly. # e.g. submod1 = call_module(...) # submod2 = call_module(submod1, ...) + + def find_input_in_partition(node, partitions, input_partitions=None): p_input_val = None direct_def = not node.name.startswith('getitem') @@ -45,9 +50,10 @@ def find_input_in_partition(node, partitions, input_partitions=None): partition_id = partition_name_to_id(partition.name) p_input_val = PartitionInputVal(partition_id=partition_id, offset=offset) return p_input_val - + return p_input_val - + + def find_output_in_partition(node, partitions, output_partitions=None): p_output_val = PartitionOutputVal() for user in node.users: @@ -70,7 +76,7 @@ def find_output_in_partition(node, partitions, output_partitions=None): if arg == user: p_output_val.add(partition_id=partition_id, offset=i) break - + # user is output if output_partitions is not None: output_node = output_partitions[0] @@ -84,10 +90,11 @@ def find_output_in_partition(node, partitions, output_partitions=None): break return p_output_val + def get_topology(gm: GraphModule): topo = Topo() topo_output_partition = Partition() - + input_partitions = [] partitions = [] output_partitions = [] @@ -109,7 +116,7 @@ def get_topology(gm: GraphModule): topo_input_partition.add_output_val(p_output_val) topo.set_partitions(partition_id=0, partition=topo_input_partition) topo.set_input_partition_id(partition_id=0) - + for i, partition in enumerate(partitions): topo_mid_partition = Partition() # set input for submodule @@ -131,15 +138,16 @@ def get_topology(gm: GraphModule): for user in partition.users: cur_node = user p_output_val = find_output_in_partition(cur_node, partitions, output_partitions) - topo_mid_partition.add_output_val(p_output_val) - topo.set_partitions(partition_id=i+2, partition=topo_mid_partition) - + topo_mid_partition.add_output_val(p_output_val) + topo.set_partitions(partition_id=i + 2, partition=topo_mid_partition) + # set input for output_partition for partition in output_partitions: topo_output_partition = Partition() - torch.fx.graph.map_arg(partition.args[0], lambda n: topo_output_partition.add_input_val( - find_input_in_partition(n, partitions, input_partitions))) + torch.fx.graph.map_arg( + partition.args[0], + lambda n: topo_output_partition.add_input_val(find_input_in_partition(n, partitions, input_partitions))) 
topo.set_partitions(partition_id=1, partition=topo_output_partition) topo.set_output_partition_id(partition_id=1) - return topo \ No newline at end of file + return topo diff --git a/colossalai/pipeline/middleware/topo.py b/colossalai/legacy/pipeline/middleware/topo.py similarity index 95% rename from colossalai/pipeline/middleware/topo.py rename to colossalai/legacy/pipeline/middleware/topo.py index e798e2ed9..3c21cce6d 100644 --- a/colossalai/pipeline/middleware/topo.py +++ b/colossalai/legacy/pipeline/middleware/topo.py @@ -1,49 +1,54 @@ -from typing import Dict, List from dataclasses import dataclass +from typing import Dict, List # This file includes data structure used by Pipeline Middleware. + @dataclass class ValPosition: partition_id: int offset: int - + def __str__(self) -> str: res = f'[partition_id:{self.partition_id},offset:{self.offset}]' return res - + def __repr__(self) -> str: return self.__str__() + class PartitionInputVal(object): + def __init__(self, partition_id, offset) -> None: # every input from which partition_id and which offset val_pos = ValPosition(partition_id, offset) self._from_partition_and_offset: ValPosition = val_pos - + def get(self): return self._from_partition_and_offset - + def __str__(self) -> str: res = '' res += f'<-({self._from_partition_and_offset})' return res - + def __repr__(self) -> str: return self.__str__() - + + class PartitionOutputVal(object): + def __init__(self) -> None: # every output to which partition_id and which offset self._to_partition_and_offset: List[ValPosition] = [] - + def add(self, partition_id, offset): val_pos = ValPosition(partition_id, offset) self._to_partition_and_offset.append(val_pos) - + def get(self): return self._to_partition_and_offset - + def __str__(self) -> str: res = '' res += '->(' @@ -51,27 +56,29 @@ class PartitionOutputVal(object): res += f'{val_pos},' res += ')' return res - + def __repr__(self) -> str: return self.__str__() + class Partition(object): + def __init__(self) -> None: self._input_vals: List[PartitionInputVal] = [] self._output_vals: List[PartitionOutputVal] = [] - + def add_input_val(self, input_val: PartitionInputVal): self._input_vals.append(input_val) - + def add_output_val(self, output_val: PartitionOutputVal): self._output_vals.append(output_val) - + def get_input_vals(self): return self._input_vals - + def get_output_vals(self): return self._output_vals - + # get the output offsets sent to dst_partition_id def get_output_offsets(self, dst_partition_id): res = [] @@ -80,9 +87,9 @@ class Partition(object): for val_pos in outputs: if val_pos.partition_id == dst_partition_id: res.append(offset) - + return res - + # get all input dst partition_ids def get_input_partition_ids(self): res = [] @@ -91,7 +98,7 @@ class Partition(object): if val_pos.partition_id not in res: res.append(val_pos.partition_id) return res - + # get all output dst partition_ids def get_output_partition_ids(self): res = [] @@ -101,24 +108,25 @@ class Partition(object): if val_pos.partition_id not in res: res.append(val_pos.partition_id) return res - + def __str__(self) -> str: res = '' res += f' input:\n' res += f' length:{len(self._input_vals)}\n' for i, input_val in enumerate(self._input_vals): res += f' offset={i}:{input_val}\n' - + res += f' output:\n' res += f' length:{len(self._output_vals)}\n' for i, output_val in enumerate(self._output_vals): res += f' offset={i}:{output_val}\n' - + return res - + def __repr__(self) -> str: return self.__str__() + # This class is a middleware between partition splitter # and 
Pipeline Scheduler. It records the graph info about # partition input/output and provides it to scheduler. @@ -132,42 +140,43 @@ class Partition(object): # _input_partition_id: the key represents input_partition # _output_partition_id: the key represents output_partition class Topo(object): + def __init__(self, input_partition_id=None, output_partition_id=None) -> None: self._partitions: Dict[int, Partition] = {} self._input_partition_id = input_partition_id self._output_partition_id = output_partition_id - + def set_input_partition_id(self, partition_id: int): self._input_partition_id = partition_id - + def set_output_partition_id(self, partition_id: int): self._output_partition_id = partition_id - + def get_input_partition_id(self): return self._input_partition_id - + def get_output_partition_id(self): return self._output_partition_id - + def set_partitions(self, partition_id: int, partition: Partition): self._partitions[partition_id] = partition - + def get_mid_partitions(self): - res = {} #{partition_id: Partition} + res = {} #{partition_id: Partition} for partition_id, partition in self._partitions.items(): if self._input_partition_id == partition_id or self._output_partition_id == partition_id: continue res[partition_id] = partition return res - + def get_mid_partition_ids(self): return list(self.get_mid_partitions().keys()) - + def get_input_partition(self): if self._input_partition_id is not None: return self._partitions[self._input_partition_id] return None - + def get_output_partition(self): if self._output_partition_id is not None: return self._partitions[self._output_partition_id] @@ -175,7 +184,7 @@ class Topo(object): def get_partition_by_id(self, partition_id): return self._partitions[partition_id] - + def __str__(self) -> str: res = '' if len(self._partitions) == 0: @@ -186,21 +195,20 @@ class Topo(object): res += '{\n' res += f'InputPartition:\n partition_id={self._input_partition_id}\n{input_part}' res += '}\n' - + mid_parts = self.get_mid_partitions() for i, (partition_id, part) in enumerate(mid_parts.items()): res += '{\n' res += f'SubPartition_{i}:\n partition_id={partition_id}\n {part}' res += '}\n' - + output_part = self.get_output_partition() if output_part is not None: res += '{\n' res += f'OutputPartition:\n partition_id={self._output_partition_id}\n{output_part}' res += '}\n' - + return res - + def __repr__(self) -> str: return self.__str__() - \ No newline at end of file diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/legacy/pipeline/pipelinable.py similarity index 93% rename from colossalai/pipeline/pipelinable.py rename to colossalai/legacy/pipeline/pipelinable.py index ba8b1591d..e74cad0ad 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/legacy/pipeline/pipelinable.py @@ -1,20 +1,16 @@ -import inspect - import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.utils import CheckpointModule from colossalai.tensor import ColoParameter from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses from .layer_spec import LayerSpec from .utils import ( - build_kwargs_for_function, build_kwargs_for_module, call_module, customized_partition, - exec_func_with_kwargs, exec_funcs_with_kwargs, partition_balanced, partition_uniform, @@ -135,8 +131,10 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): children_name = 
[] for child in self._root_children: layer_spec = self._layer_spec_dict[id(child)] - if layer_spec.typename in (torch.nn.modules.container.ModuleList, - torch.nn.modules.container.Sequential): + if layer_spec.typename in ( + torch.nn.modules.container.ModuleList, + torch.nn.modules.container.Sequential, + ): for child_in_container in layer_spec.children: self._layer_spec_list.append(self._layer_spec_dict[id(child_in_container)]) for name, module in self._model.named_modules(): @@ -155,9 +153,11 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): named_modules = dict(self._model.named_modules()) for index, element in enumerate(exec_seq): if isinstance(element, str): - if element == 'SPLIT_NODE': + if element == "SPLIT_NODE": continue - assert element in named_modules, f'Found invalid module name {element}, please check if you spell the module name correctly.' + assert ( + element in named_modules + ), f"Found invalid module name {element}, please check if you spell the module name correctly." # get the layer spec based on the module ID module = named_modules[element] @@ -198,11 +198,12 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses): param_counts.append(layer_spec.count_params()) parts = partition_balanced(param_counts, pipeline_size, num_chunks)[rank] elif self._policy == "customized": - assert self._exec_seq is not None, f'An explicit exec_seq must be defined by user in customized policy mode.' + assert (self._exec_seq + is not None), f"An explicit exec_seq must be defined by user in customized policy mode." self.customized_parts = customized_partition(self._exec_seq) assert len(self.customized_parts) == gpc.get_world_size( ParallelMode.PIPELINE - ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}' + ), f"World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}" parts = self.customized_parts[rank] else: raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].") @@ -241,7 +242,6 @@ class PipelinableModel(torch.nn.Module): def forward(self, *input_tensor, **kwargs): for module in self._module_list: - if id(module) in self._front_func_dict: input_tensor = exec_funcs_with_kwargs(self._front_func_dict, id(module), input_tensor, kwargs) diff --git a/colossalai/pipeline/pipeline_process_group.py b/colossalai/legacy/pipeline/pipeline_process_group.py similarity index 98% rename from colossalai/pipeline/pipeline_process_group.py rename to colossalai/legacy/pipeline/pipeline_process_group.py index c61d97eba..1168158de 100644 --- a/colossalai/pipeline/pipeline_process_group.py +++ b/colossalai/legacy/pipeline/pipeline_process_group.py @@ -1,11 +1,11 @@ -from typing import List, Dict, Tuple import os import threading +from typing import Dict, List, Tuple -from torch.distributed import rpc import torch.distributed as dist +from torch.distributed import rpc -from colossalai.tensor import ProcessGroup +from colossalai.legacy.tensor import ProcessGroup class PipelineProcessGroup: diff --git a/colossalai/legacy/pipeline/rpc/__init__.py b/colossalai/legacy/pipeline/rpc/__init__.py new file mode 100644 index 000000000..15b65a413 --- /dev/null +++ b/colossalai/legacy/pipeline/rpc/__init__.py @@ -0,0 +1,4 @@ +from ._pipeline_schedule import ChimeraPipelineEngine, FillDrainPipelineEngine, OneFOneBPipelineEngine +from .utils import pytree_map + +__all__ = ['FillDrainPipelineEngine', 
'OneFOneBPipelineEngine', 'ChimeraPipelineEngine', 'pytree_map'] diff --git a/colossalai/pipeline/rpc/_pipeline_base.py b/colossalai/legacy/pipeline/rpc/_pipeline_base.py similarity index 99% rename from colossalai/pipeline/rpc/_pipeline_base.py rename to colossalai/legacy/pipeline/rpc/_pipeline_base.py index 9e549df58..88ddb9e98 100644 --- a/colossalai/pipeline/rpc/_pipeline_base.py +++ b/colossalai/legacy/pipeline/rpc/_pipeline_base.py @@ -12,9 +12,9 @@ from torch import autograd, nn, optim from torch._C._distributed_rpc import PyRRef from torch.futures import Future -from colossalai.pipeline.middleware import Partition, PartitionInputVal, PartitionOutputVal, Topo -from colossalai.pipeline.pipeline_process_group import ppg -from colossalai.pipeline.rpc.utils import ( +from colossalai.legacy.pipeline.middleware import Partition, PartitionInputVal, PartitionOutputVal, Topo +from colossalai.legacy.pipeline.pipeline_process_group import ppg +from colossalai.legacy.pipeline.rpc.utils import ( get_batch_lengths, pyobj_map, pytree_filter, diff --git a/colossalai/pipeline/rpc/_pipeline_schedule.py b/colossalai/legacy/pipeline/rpc/_pipeline_schedule.py similarity index 97% rename from colossalai/pipeline/rpc/_pipeline_schedule.py rename to colossalai/legacy/pipeline/rpc/_pipeline_schedule.py index 6eda8f3b3..f53a4835e 100644 --- a/colossalai/pipeline/rpc/_pipeline_schedule.py +++ b/colossalai/legacy/pipeline/rpc/_pipeline_schedule.py @@ -6,8 +6,8 @@ import torch.distributed as dist from torch._C._distributed_rpc import PyRRef from torch.futures import Future -from colossalai.pipeline.pipeline_process_group import ppg -from colossalai.pipeline.rpc._pipeline_base import Phase, PipelineEngineBase, UniqueKey, WorkerBase, WorkItem +from colossalai.legacy.pipeline.pipeline_process_group import ppg +from colossalai.legacy.pipeline.rpc._pipeline_base import Phase, PipelineEngineBase, UniqueKey, WorkerBase, WorkItem # Implementation of different Pipeline schedule # Worker defines the worker for each stage @@ -78,7 +78,7 @@ class OneFOneBWorker(WorkerBase): # 1. forward times reach actual_stage_num, this is the end of continuous forward # 2. forward times reach num_microbatches, this is the end of 1F1B mode if not is_last_stage and \ - target_key.phase == Phase.FORWARD: + target_key.phase == Phase.FORWARD: if target_key.microbatch_id == actual_stage_num - 1 and num_microbatches > 2: # Why need num_microbatches > 2 ? 
Because there is no steady stage when num_microbatches <= 2 outstanding_min = actual_stage_num - pp_rank - 1 @@ -144,7 +144,7 @@ class ChimeraWorker(WorkerBase): forward_block_num = self.forward_times // forward_block_size if self.forward_times >= real_microbatch_num or \ - ((pp_rank + 1) % stage_num == 0 and forward_block_num > self.backward_times): + ((pp_rank + 1) % stage_num == 0 and forward_block_num > self.backward_times): target_phase = Phase.BACKWARD target_microbatch_id = self.backward_times else: # others diff --git a/colossalai/pipeline/rpc/utils.py b/colossalai/legacy/pipeline/rpc/utils.py similarity index 98% rename from colossalai/pipeline/rpc/utils.py rename to colossalai/legacy/pipeline/rpc/utils.py index 06e6d976d..d1033fbde 100644 --- a/colossalai/pipeline/rpc/utils.py +++ b/colossalai/legacy/pipeline/rpc/utils.py @@ -10,7 +10,7 @@ from torch._C._distributed_rpc import _is_current_rpc_agent_set from torch.futures import Future from colossalai.initialize import launch -from colossalai.pipeline.pipeline_process_group import ppg +from colossalai.legacy.pipeline.pipeline_process_group import ppg def pyobj_map(obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type]] = ()) -> Any: diff --git a/colossalai/pipeline/utils.py b/colossalai/legacy/pipeline/utils.py similarity index 100% rename from colossalai/pipeline/utils.py rename to colossalai/legacy/pipeline/utils.py diff --git a/colossalai/legacy/tensor/__init__.py b/colossalai/legacy/tensor/__init__.py new file mode 100644 index 000000000..d3278bf1e --- /dev/null +++ b/colossalai/legacy/tensor/__init__.py @@ -0,0 +1,17 @@ +from . import distspec +from .compute_spec import ComputePattern, ComputeSpec +from .dist_spec_mgr import DistSpecManager +from .distspec import ReplicaSpec, ShardSpec +from .process_group import ProcessGroup +from .tensor_spec import ColoTensorSpec + +__all__ = [ + 'ComputePattern', + 'ComputeSpec', + 'distspec', + 'DistSpecManager', + 'ProcessGroup', + 'ColoTensorSpec', + 'ShardSpec', + 'ReplicaSpec', +] diff --git a/colossalai/tensor/compute_spec.py b/colossalai/legacy/tensor/compute_spec.py similarity index 100% rename from colossalai/tensor/compute_spec.py rename to colossalai/legacy/tensor/compute_spec.py diff --git a/colossalai/tensor/const.py b/colossalai/legacy/tensor/const.py similarity index 100% rename from colossalai/tensor/const.py rename to colossalai/legacy/tensor/const.py diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/legacy/tensor/dist_spec_mgr.py similarity index 97% rename from colossalai/tensor/dist_spec_mgr.py rename to colossalai/legacy/tensor/dist_spec_mgr.py index 4740a316b..d97308b04 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/legacy/tensor/dist_spec_mgr.py @@ -4,12 +4,12 @@ import torch import torch.distributed as dist from numpy import prod -from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec -from colossalai.tensor.process_group import ProcessGroup +from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec +from colossalai.legacy.tensor.process_group import ProcessGroup # TODO(jiaruifang) circle import, move the divide to colossalai.commons. -# colossalai.tensor shall not import any submodule from colossal.nn +# colossalai.legacy.tensor shall not import any submodule from colossal.nn def divide(numerator, denominator): """Only allow exact division. 
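Note: from this point the tensor-spec machinery (ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec, ReplicaSpec, ColoTensorSpec) is re-exported from colossalai.legacy.tensor, while ColoTensor and ColoParameter stay under colossalai.tensor. A minimal before/after sketch of user code affected by the rename — the concrete shard layout is illustrative only, the dist_attr/compute_attr field names are assumed from the existing spec dataclass, and it presumes the process group has been initialized (e.g. via colossalai.launch):

import torch

# Old import path (removed by this patch):
#   from colossalai.tensor import ColoTensorSpec, ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
# New import path:
from colossalai.legacy.tensor import ColoTensorSpec, ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
from colossalai.tensor import ColoTensor    # ColoTensor itself is not part of the move

pg = ProcessGroup(tp_degree=2)              # 1-D tensor-parallel group of size 2
shard = ShardSpec([-1], [2])                # shard the last dimension across that group (illustrative)
spec = ColoTensorSpec(pg=pg, dist_attr=shard, compute_attr=ComputeSpec(ComputePattern.TP1D))
t = ColoTensor(torch.randn(16, 32), spec)   # a tensor carrying the legacy distribution spec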
diff --git a/colossalai/tensor/distspec.py b/colossalai/legacy/tensor/distspec.py similarity index 100% rename from colossalai/tensor/distspec.py rename to colossalai/legacy/tensor/distspec.py diff --git a/colossalai/tensor/op_wrapper.py b/colossalai/legacy/tensor/op_wrapper.py similarity index 97% rename from colossalai/tensor/op_wrapper.py rename to colossalai/legacy/tensor/op_wrapper.py index 1c00066f7..63ebaa264 100644 --- a/colossalai/tensor/op_wrapper.py +++ b/colossalai/legacy/tensor/op_wrapper.py @@ -1,8 +1,5 @@ -from typing import ( - Callable, - Dict, -) import functools +from typing import Callable, Dict # Custom sharded ops _COLOSSAL_OPS: Dict[str, Callable] = {} diff --git a/colossalai/tensor/process_group.py b/colossalai/legacy/tensor/process_group.py similarity index 100% rename from colossalai/tensor/process_group.py rename to colossalai/legacy/tensor/process_group.py diff --git a/colossalai/tensor/tensor_spec.py b/colossalai/legacy/tensor/tensor_spec.py similarity index 79% rename from colossalai/tensor/tensor_spec.py rename to colossalai/legacy/tensor/tensor_spec.py index 580df9f8f..aa792e507 100644 --- a/colossalai/tensor/tensor_spec.py +++ b/colossalai/legacy/tensor/tensor_spec.py @@ -1,8 +1,8 @@ from dataclasses import dataclass from typing import Optional -from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec -from colossalai.tensor.process_group import ProcessGroup +from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec +from colossalai.legacy.tensor.process_group import ProcessGroup from .compute_spec import ComputeSpec diff --git a/colossalai/legacy/trainer/_trainer.py b/colossalai/legacy/trainer/_trainer.py index 1847e5622..1cb99fcc9 100644 --- a/colossalai/legacy/trainer/_trainer.py +++ b/colossalai/legacy/trainer/_trainer.py @@ -6,8 +6,9 @@ from tqdm import tqdm from colossalai.legacy.engine import Engine from colossalai.legacy.trainer.hooks import BaseHook +from colossalai.legacy.utils import is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0 from colossalai.logging import DistributedLogger -from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0 +from colossalai.utils import MultiTimer class Trainer: diff --git a/colossalai/legacy/trainer/hooks/_checkpoint_hook.py b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py index 6b150d291..cda10030b 100644 --- a/colossalai/legacy/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/legacy/trainer/hooks/_checkpoint_hook.py @@ -4,8 +4,8 @@ import torch from colossalai.legacy.registry import HOOKS from colossalai.legacy.trainer.hooks import BaseHook +from colossalai.legacy.utils.checkpointing import save_checkpoint from colossalai.logging import get_dist_logger -from colossalai.utils.checkpointing import save_checkpoint from ._lr_scheduler_hook import LRSchedulerHook diff --git a/colossalai/legacy/trainer/hooks/_log_hook.py b/colossalai/legacy/trainer/hooks/_log_hook.py index 7d9ad19aa..b1a398ce7 100644 --- a/colossalai/legacy/trainer/hooks/_log_hook.py +++ b/colossalai/legacy/trainer/hooks/_log_hook.py @@ -5,12 +5,13 @@ import os import os.path as osp from typing import List -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import HOOKS from colossalai.legacy.trainer.hooks._metric_hook import ThroughputMetric +from colossalai.legacy.utils import is_dp_rank_0, 
is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage from colossalai.logging import DistributedLogger -from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0, report_memory_usage +from colossalai.utils import MultiTimer from ._base_hook import BaseHook from ._commons_ import _format_number @@ -112,8 +113,8 @@ class TensorboardHook(BaseHook): Args: log_dir (str): Directory of log. ranks (list): Ranks of processors. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer, - defaults to colossalai.context.parallel_mode.ParallelMode.GLOBAL. + parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): Parallel mode used in trainer, + defaults to colossalai.legacy.context.parallel_mode.ParallelMode.GLOBAL. priority (int, optional): Priority in the printing, hooks with small priority will be printed in front, defaults to 10. If different hooks share same priority, the order of printing would depend on the hooks order in the hook list. diff --git a/colossalai/legacy/trainer/hooks/_metric_hook.py b/colossalai/legacy/trainer/hooks/_metric_hook.py index f1bd19387..899e4d08a 100644 --- a/colossalai/legacy/trainer/hooks/_metric_hook.py +++ b/colossalai/legacy/trainer/hooks/_metric_hook.py @@ -7,11 +7,12 @@ from typing import Callable import torch import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc from colossalai.legacy.communication import all_reduce +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.registry import HOOKS -from colossalai.utils import get_current_device, is_no_pp_or_last_stage +from colossalai.legacy.utils import is_no_pp_or_last_stage +from colossalai.utils import get_current_device from ._base_hook import BaseHook from ._commons_ import _format_number diff --git a/colossalai/legacy/utils/__init__.py b/colossalai/legacy/utils/__init__.py new file mode 100644 index 000000000..ae358f8be --- /dev/null +++ b/colossalai/legacy/utils/__init__.py @@ -0,0 +1,53 @@ +from .checkpointing import load_checkpoint, save_checkpoint +from .common import ( + clip_grad_norm_fp32, + copy_tensor_parallel_attributes, + count_zeros_fp32, + is_dp_rank_0, + is_model_parallel_parameter, + is_no_pp_or_last_stage, + is_tp_rank_0, + is_using_ddp, + is_using_pp, + is_using_sequence, + param_is_not_tensor_parallel_duplicate, + print_rank_0, + switch_virtual_pipeline_parallel_rank, + sync_model_param, +) +from .data_sampler import DataParallelSampler, get_dataloader +from .memory import ( + colo_device_memory_capacity, + colo_device_memory_used, + colo_get_cpu_memory_capacity, + colo_set_cpu_memory_capacity, + colo_set_process_memory_fraction, + report_memory_usage, +) + +__all__ = [ + 'DataParallelSampler', + 'get_dataloader', + 'save_checkpoint', + 'load_checkpoint', + 'colo_device_memory_capacity', + 'colo_device_memory_used', + 'colo_get_cpu_memory_capacity', + 'colo_set_cpu_memory_capacity', + 'colo_set_process_memory_fraction', + 'report_memory_usage', + 'clip_grad_norm_fp32', + 'copy_tensor_parallel_attributes', + 'count_zeros_fp32', + 'is_dp_rank_0', + 'is_model_parallel_parameter', + 'is_no_pp_or_last_stage', + 'is_tp_rank_0', + 'is_using_ddp', + 'is_using_pp', + 'is_using_sequence', + 'param_is_not_tensor_parallel_duplicate', + 'print_rank_0', + 'switch_virtual_pipeline_parallel_rank', + 'sync_model_param', +] diff --git 
a/colossalai/utils/activation_checkpoint.py b/colossalai/legacy/utils/activation_checkpoint.py similarity index 95% rename from colossalai/utils/activation_checkpoint.py rename to colossalai/legacy/utils/activation_checkpoint.py index fa9ed827a..add690f28 100644 --- a/colossalai/utils/activation_checkpoint.py +++ b/colossalai/legacy/utils/activation_checkpoint.py @@ -1,13 +1,13 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import weakref + import torch from torch.utils.checkpoint import check_backward_validity, detach_variable -from colossalai.context.random import get_states, get_current_mode, set_seed_states, set_mode, sync_states -from .cuda import get_current_device - -import weakref +from colossalai.legacy.context.random import get_current_mode, get_states, set_mode, set_seed_states, sync_states +from colossalai.utils import get_current_device def copy_to_device(obj, device): @@ -143,7 +143,7 @@ def checkpoint(function, activation_offload, *args, use_reentrant: bool = True): Args: function: Describe the forward pass function. It should know how to handle the input tuples. - activation_offload: The variable to check whether we should offload activation to cpu + activation_offload: The variable to check whether we should offload activation to cpu args (list): Tuple containing the parameters of the function use_reentrant: Bool type to check if we need to use_reentrant, if use_reentrant=False, there might be more flexibility for user to define there checkpoint function @@ -227,12 +227,12 @@ def _checkpoint_without_reentrant(function, activation_offload=False, *args): # rerun forward, the inner_pack will store all the activations in storage if has_autocast_in_fwd: with torch.enable_grad(), \ - torch.cuda.amp.autocast(), \ - torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack): + torch.cuda.amp.autocast(), \ + torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack): _unused = function(*args) else: with torch.enable_grad(), \ - torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack): + torch.autograd.graph.saved_tensors_hooks(inner_pack, inner_unpack): _unused = function(*args) if x not in storage: diff --git a/colossalai/legacy/utils/checkpoint/__init__.py b/colossalai/legacy/utils/checkpoint/__init__.py new file mode 100644 index 000000000..558a956b3 --- /dev/null +++ b/colossalai/legacy/utils/checkpoint/__init__.py @@ -0,0 +1,3 @@ +from .module_checkpoint import load_checkpoint, save_checkpoint + +__all__ = ['save_checkpoint', 'load_checkpoint'] diff --git a/colossalai/utils/checkpoint/module_checkpoint.py b/colossalai/legacy/utils/checkpoint/module_checkpoint.py similarity index 90% rename from colossalai/utils/checkpoint/module_checkpoint.py rename to colossalai/legacy/utils/checkpoint/module_checkpoint.py index d390da864..9bd2907ab 100644 --- a/colossalai/utils/checkpoint/module_checkpoint.py +++ b/colossalai/legacy/utils/checkpoint/module_checkpoint.py @@ -1,25 +1,28 @@ +from typing import Dict, Optional + import torch import torch.distributed as dist + +from colossalai.interface import OptimizerWrapper from colossalai.tensor import ColoTensor -from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.utils.checkpoint.utils import gather_tensor, scatter_tensor -from typing import Optional, Dict + +from .utils import gather_tensor, scatter_tensor def save_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: 
torch.optim.lr_scheduler._LRScheduler = None, *args, **kwargs): - """save_checkpoint + """save_checkpoint save a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. """ rank = dist.get_rank() @@ -74,17 +77,17 @@ def save_checkpoint(path: str, def load_checkpoint(path: str, epoch: int, model: torch.nn.Module, - optimizer: Optional[ColossalaiOptimizer] = None, + optimizer: Optional[OptimizerWrapper] = None, lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None, torch_load_kwargs: Optional[Dict] = None, load_state_dict_kwargs: Optional[Dict] = None): - """load_checkpoint + """load_checkpoint load a model, whose parameters are `ColoTensor`s. Args: path (str): directory to save the checkpoint files. epoch (int): the number of epoch model (torch.nn.Module): a torch module initialized by ColoInitContext - optimizer (ColossalaiOptimizer, optional): optimizers. Defaults to None. + optimizer (OptimizerWrapper, optional): optimizers. Defaults to None. lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): lr schedule. Defaults to None. torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function diff --git a/colossalai/utils/checkpoint/utils.py b/colossalai/legacy/utils/checkpoint/utils.py similarity index 91% rename from colossalai/utils/checkpoint/utils.py rename to colossalai/legacy/utils/checkpoint/utils.py index 682cd0903..c830d4811 100644 --- a/colossalai/utils/checkpoint/utils.py +++ b/colossalai/legacy/utils/checkpoint/utils.py @@ -1,63 +1,65 @@ -import torch -import torch.distributed as dist -from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.tensor.distspec import _DistSpec, DistPlacementPattern - - -def robust_broadcast(tensor): - with torch.no_grad(): - is_cpu_ten = tensor.device.type == 'cpu' - if is_cpu_ten: - b_data = tensor.cuda() - else: - b_data = tensor - - dist.broadcast(b_data, 0) - - if is_cpu_ten: - tensor.copy_(b_data) - - -def gather_tensor(colo_tensor: ColoTensor) -> None: - """Make colo_tensor replicated when the rank is 0 - """ - if not colo_tensor.is_replicate(): - pg = colo_tensor.get_process_group() - # for the group which contains rank 0 - if pg.dp_local_rank() == 0: - old_dist_spec = colo_tensor.dist_spec - colo_tensor.to_replicate_() - if dist.get_rank() != 0: - colo_tensor.set_dist_spec(old_dist_spec) - - # synchronize all processes for unexpected problems - dist.barrier() - - if dist.get_rank() == 0: - setattr(colo_tensor, 'save_ready', True) # set saving signature - - -def scatter_tensor(colo_tensor: ColoTensor, dist_spec: _DistSpec) -> None: - """Reversal operation of `gather_tensor`. 
- """ - if dist_spec.placement == DistPlacementPattern.REPLICATE: - robust_broadcast(colo_tensor.data) - else: - global_size = colo_tensor.size_global() - - if dist.get_rank() == 0: - entire_data = colo_tensor.data - else: - entire_data = torch.empty(global_size, device=colo_tensor.device) - robust_broadcast(entire_data) - - if dist.get_rank() == 0: - colo_tensor.set_dist_spec(dist_spec) - else: - rep_tensor = ColoTensor( - entire_data, ColoTensorSpec(pg=colo_tensor.get_process_group(), compute_attr=colo_tensor.compute_spec)) - rep_tensor.set_dist_spec(dist_spec) - with torch.no_grad(): - colo_tensor.data.copy_(rep_tensor.data) - # synchronize all processes for unexpected problems - dist.barrier() +import torch +import torch.distributed as dist + +from colossalai.legacy.tensor import ColoTensorSpec +from colossalai.legacy.tensor.distspec import DistPlacementPattern, _DistSpec +from colossalai.tensor import ColoTensor + + +def robust_broadcast(tensor): + with torch.no_grad(): + is_cpu_ten = tensor.device.type == 'cpu' + if is_cpu_ten: + b_data = tensor.cuda() + else: + b_data = tensor + + dist.broadcast(b_data, 0) + + if is_cpu_ten: + tensor.copy_(b_data) + + +def gather_tensor(colo_tensor: ColoTensor) -> None: + """Make colo_tensor replicated when the rank is 0 + """ + if not colo_tensor.is_replicate(): + pg = colo_tensor.get_process_group() + # for the group which contains rank 0 + if pg.dp_local_rank() == 0: + old_dist_spec = colo_tensor.dist_spec + colo_tensor.to_replicate_() + if dist.get_rank() != 0: + colo_tensor.set_dist_spec(old_dist_spec) + + # synchronize all processes for unexpected problems + dist.barrier() + + if dist.get_rank() == 0: + setattr(colo_tensor, 'save_ready', True) # set saving signature + + +def scatter_tensor(colo_tensor: ColoTensor, dist_spec: _DistSpec) -> None: + """Reversal operation of `gather_tensor`. 
+ """ + if dist_spec.placement == DistPlacementPattern.REPLICATE: + robust_broadcast(colo_tensor.data) + else: + global_size = colo_tensor.size_global() + + if dist.get_rank() == 0: + entire_data = colo_tensor.data + else: + entire_data = torch.empty(global_size, device=colo_tensor.device) + robust_broadcast(entire_data) + + if dist.get_rank() == 0: + colo_tensor.set_dist_spec(dist_spec) + else: + rep_tensor = ColoTensor( + entire_data, ColoTensorSpec(pg=colo_tensor.get_process_group(), compute_attr=colo_tensor.compute_spec)) + rep_tensor.set_dist_spec(dist_spec) + with torch.no_grad(): + colo_tensor.data.copy_(rep_tensor.data) + # synchronize all processes for unexpected problems + dist.barrier() diff --git a/colossalai/utils/checkpointing.py b/colossalai/legacy/utils/checkpointing.py similarity index 98% rename from colossalai/utils/checkpointing.py rename to colossalai/legacy/utils/checkpointing.py index d1c6b6370..b7b29cc98 100644 --- a/colossalai/utils/checkpointing.py +++ b/colossalai/legacy/utils/checkpointing.py @@ -3,9 +3,11 @@ from itertools import chain import torch import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.constants import IS_TENSOR_PARALLEL + +from colossalai.legacy.constants import IS_TENSOR_PARALLEL +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc + try: from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX except ImportError: diff --git a/colossalai/legacy/utils/common.py b/colossalai/legacy/utils/common.py new file mode 100644 index 000000000..35095161c --- /dev/null +++ b/colossalai/legacy/utils/common.py @@ -0,0 +1,434 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +from collections import defaultdict +from contextlib import contextmanager +from typing import Dict, List, Optional, Union + +import torch +import torch.distributed as dist +from torch import inf +from torch.nn.parameter import Parameter + +from colossalai.legacy.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env +from colossalai.legacy.tensor import ProcessGroup +from colossalai.tensor import ColoParameter +from colossalai.utils.multi_tensor_apply import multi_tensor_applier + +try: + from colossalai._C import fused_optim +except: + fused_optim = None + + +def print_rank_0(msg: str, logger=None): + """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu. + + Args: + msg (str): A string message to output. + logger (:class:`colossalai.logging.DistributedLogger`, optional): + The logger to record the message, defaults to None. + """ + if gpc.get_global_rank() == 0: + if logger is None: + print(msg, flush=True) + else: + logger.info(msg) + + +def sync_model_param(model, parallel_mode): + r"""Make sure data parameters are consistent during Data Parallel Mode. + + Args: + model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. + parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel mode to be checked. + + Note: + The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found + in `parallel_mode `_ + """ + if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: + for param in model.parameters(): + ranks = gpc.get_ranks_in_group(parallel_mode) + dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) + + +def is_dp_rank_0(): + return not gpc.is_initialized(ParallelMode.DATA) or gpc.is_first_rank(ParallelMode.DATA) + + +def is_tp_rank_0(): + return not gpc.is_initialized(ParallelMode.TENSOR) or gpc.is_first_rank(ParallelMode.TENSOR) + + +def is_no_pp_or_last_stage(): + return not gpc.is_initialized(ParallelMode.PIPELINE) or gpc.is_last_rank(ParallelMode.PIPELINE) + + +def is_using_ddp(): + return gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1 + + +def is_using_pp(): + return gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1 + + +def is_using_sequence(): + return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1 + + +class model_branch_context(object): + + def __enter__(self): + self.env_status = env.save() + + def __exit__(self, *exc_info): + env.load(**self.env_status) + + +def is_model_parallel_parameter(p): + return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) + + +def _calc_l2_norm(grads): + # we should not + global fused_optim + + if fused_optim is None: + from colossalai.kernel.op_builder import FusedOptimBuilder + fused_optim = FusedOptimBuilder().load() + + norm = 0.0 + if len(grads) > 0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + norm, _ = multi_tensor_applier( + fused_optim.multi_tensor_l2norm, + dummy_overflow_buf, + [grads], + False # no per-parameter norm + ) + return norm + + +def _calc_lp(grads, norm_type): + norm = 0.0 + for grad in grads: + grad_norm = torch.norm(grad, norm_type) + norm += grad_norm**norm_type + return norm + + +def _move_norm_to_cuda(norm: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]: + if torch.is_tensor(norm) and norm.device.type != 'cuda': + norm = norm.to(torch.cuda.current_device()) + return norm + + +def _get_tensor_norm(norm: Union[float, torch.Tensor], move_to_cuda) -> torch.Tensor: + if isinstance(norm, float): + norm = torch.Tensor([norm]) + if move_to_cuda: + norm = norm.to(torch.cuda.current_device()) + return norm + + +# ======== Gradient Clipping ========= + + +def _compute_local_lp(params: List[ColoParameter], norm_type: float) -> float: + if len(params) == 0: + return 0.0 + grads = [p.grad for p in params] + use_cuda_kernel = grads[0].device.type == 'cuda' + if norm_type == inf: + local_lp = max([g.abs().max() for g in grads]) + elif norm_type == 2.0 and use_cuda_kernel: + local_lp = _calc_l2_norm(grads)**norm_type + else: + local_lp = _calc_lp(grads, norm_type) + if isinstance(local_lp, torch.Tensor): + return local_lp.item() + return local_lp + + +def _compute_buckets_lp(params: List[ColoParameter], norm_type: float) -> float: + if len(params) == 0: + return 0.0 + buckets: Dict[Optional[ProcessGroup], List[ColoParameter]] = defaultdict(list) + for p in params: + if p.is_replicate(): + buckets[None].append(p) + else: + buckets[p.get_process_group().tp_process_group()].append(p) + total_lp = 0.0 + for group, bucket in buckets.items(): + local_lp = _compute_local_lp(bucket, norm_type) + if group is not None: + local_lp_tensor = torch.tensor([local_lp], device=torch.cuda.current_device()) + if norm_type == inf: + dist.all_reduce(local_lp_tensor, 
op=dist.ReduceOp.MAX, group=group) + else: + dist.all_reduce(local_lp_tensor, group=group) + local_lp = local_lp_tensor.item() + if norm_type == inf: + total_lp = max(total_lp, local_lp) + else: + total_lp += local_lp + return total_lp + + +def _compute_pp_grad_lp(total_lp: float, norm_type: float) -> float: + if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + total_lp_tensor = torch.tensor([total_lp], device=torch.cuda.current_device()) + if norm_type == inf: + dist.all_reduce(total_lp_tensor, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.PIPELINE)) + else: + dist.all_reduce(total_lp_tensor, group=gpc.get_group(ParallelMode.PIPELINE)) + total_lp = total_lp_tensor.item() + return total_lp + + +def _compute_grad_lp(parameters, norm_type: float = 2.0) -> float: + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + grad_dtype = None + cpu_grad_params: List[ColoParameter] = [] + cuda_grad_params: List[ColoParameter] = [] + for p in parameters: + if p.grad is None: + continue + assert isinstance(p, ColoParameter) + if grad_dtype is None: + grad_dtype = p.grad.dtype + assert p.grad.dtype == grad_dtype, f'Expected all grads are {grad_dtype}, got {p.grad.dtype}' + if p.grad.device.type == 'cuda': + cuda_grad_params.append(p) + else: + cpu_grad_params.append(p) + norm_type = float(norm_type) + cpu_lp = _compute_buckets_lp(cpu_grad_params, norm_type) + cuda_lp = _compute_buckets_lp(cuda_grad_params, norm_type) + if norm_type == inf: + total_lp = max(cpu_lp, cuda_lp) + else: + total_lp = cpu_lp + cuda_lp + return _compute_pp_grad_lp(total_lp, norm_type) + + +def compute_grad_norm(parameters, norm_type: float = 2.0) -> float: + norm_type = float(norm_type) + total_norm = _compute_grad_lp(parameters, norm_type) + if norm_type != inf: + total_norm = total_norm**(1 / norm_type) + return total_norm + + +def _clip_grad_norm(parameters, max_norm: float, total_norm: float) -> None: + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1.0: + cuda_grads: List[torch.Tensor] = [] + cpu_grads: List[torch.Tensor] = [] + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + for p in parameters: + if p.grad is None: + continue + if p.grad.device.type == 'cuda': + cuda_grads.append(p.grad.detach()) + else: + cpu_grads.append(p.grad.detach()) + if len(cuda_grads) > 0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [cuda_grads, cuda_grads], + clip_coef) + for g in cpu_grads: + g.mul_(clip_coef) + + +def clip_grad_norm(parameters, max_norm: float, norm_type: float = 2.0) -> float: + total_norm = compute_grad_norm(parameters, norm_type) + _clip_grad_norm(parameters, max_norm, total_norm) + return total_norm + + +def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters whose gradients are in fp32. + + This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and + added functionality to handle model parallel parameters. + + Note: + the gradients are modified in place. + + Args: + parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`): + An iterable of Tensors or a single Tensor that will have gradients normalized. + max_norm (Union[float, int]): Max norm of the gradients. + norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm. + + Returns: + float: Total norm of the parameters. 
+ """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + params: List[Parameter] = [] + has_zero_shared_param: bool = False + for param in parameters: + if param.grad is not None: + # Make sure the grads are in fp32 + assert param.grad.dtype == torch.float, \ + f'expected gradient to be dtype torch.float, but got {param.grad.type()}' + if hasattr(param, 'colo_attr') and param.colo_attr.sharded_data_tensor.is_sharded: + has_zero_shared_param = True + params.append(param) + + if len(params) == 0: + enable_cuda_kernels = False + else: + enable_cuda_kernels = params[0].grad.device.type == 'cuda' + # Norm parameters. + max_norm = float(max_norm) + norm_type = float(norm_type) + + # Parameters can be on CPU or CUDA + # If parameters are on CPU, disable CUDA kernels + + # Calculate norm. + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in params) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + # Take max across all model-parallel GPUs. + if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1: + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=gpc.get_group(ParallelMode.MODEL), + async_op=False) + if has_zero_shared_param: + dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=gpc.get_group(ParallelMode.DATA), + async_op=False) + total_norm = total_norm_cuda[0].item() + else: + tensor_parallel_grads = [] + no_tensor_parallel_grads = [] + zero_sharded_grads = [] + for p in params: + if is_model_parallel_parameter(p): + reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS))**(1 / norm_type) + tensor_parallel_grads.append(p.grad.data / reductor) + elif hasattr(p, 'colo_attr') and p.colo_attr.sharded_data_tensor.is_sharded: + zero_sharded_grads.append(p.grad.data) + else: + no_tensor_parallel_grads.append(p.grad.data) + + if norm_type == 2.0 and enable_cuda_kernels: + tensor_parallel_norm = _calc_l2_norm(tensor_parallel_grads)**norm_type + no_tensor_parallel_norm = _calc_l2_norm(no_tensor_parallel_grads)**norm_type + zero_sharded_norm = _calc_l2_norm(zero_sharded_grads)**norm_type + else: + tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type) + no_tensor_parallel_norm = _calc_lp(no_tensor_parallel_grads, norm_type) + zero_sharded_norm = _calc_lp(zero_sharded_grads, norm_type) + # If norm is type of float, then we convert them into torch.Tensor. + tensor_parallel_norm = _get_tensor_norm(tensor_parallel_norm, enable_cuda_kernels) + no_tensor_parallel_norm = _get_tensor_norm(no_tensor_parallel_norm, enable_cuda_kernels) + zero_sharded_norm = _get_tensor_norm(zero_sharded_norm, enable_cuda_kernels) + # If grads are on CPU, the norms is also on CPU. Cast them to CUDA tensors + if not enable_cuda_kernels: + tensor_parallel_norm = _move_norm_to_cuda(tensor_parallel_norm) + no_tensor_parallel_norm = _move_norm_to_cuda(no_tensor_parallel_norm) + zero_sharded_norm = _move_norm_to_cuda(zero_sharded_norm) + + # Sum across all model-parallel GPUs. 
+ if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: + dist.all_reduce(tensor_parallel_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) + # Sum across all zero sharded GPUs + if len(zero_sharded_grads) > 0: + dist.all_reduce(zero_sharded_norm, group=gpc.get_group(ParallelMode.DATA)) + no_tensor_parallel_norm += zero_sharded_norm + total_norm = tensor_parallel_norm + no_tensor_parallel_norm + if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) + total_norm = total_norm**(1.0 / norm_type) + if torch.is_tensor(total_norm): + total_norm = total_norm.item() + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + if enable_cuda_kernels: + grads = [p.grad.detach() for p in params] + dummy_overflow_buf = torch.cuda.IntTensor([0]) + multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) + else: + for p in params: + p.grad.detach().mul_(clip_coeff) + return total_norm + + +def count_zeros_fp32(parameters): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + total_num_zeros = 0.0 + for param in parameters: + grad_not_none = param.grad is not None + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_tp_duplicate: + grad = param.grad.detach() + num_zeros = grad.numel() - torch.count_nonzero(grad) + total_num_zeros = num_zeros + total_num_zeros + + total_num_zeros = torch.IntTensor([int(total_num_zeros)]).cuda() + + # Sum across all model-parallel GPUs. 
+ ops = [] + ops.append( + dist.all_reduce(total_num_zeros, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR), async_op=True)) + if gpc.is_initialized(ParallelMode.PIPELINE): + ops.append( + dist.all_reduce(total_num_zeros, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE), + async_op=True)) + + for req in ops: + req.wait() + total_num_zeros = total_num_zeros.item() + + return total_num_zeros + + +def copy_tensor_parallel_attributes(src_tensor, dst_tensor): + for attr in TENSOR_PARALLEL_ATTRIBUTES: + if hasattr(src_tensor, attr): + val = getattr(src_tensor, attr) + setattr(dst_tensor, attr, val) + + +def param_is_not_tensor_parallel_duplicate(param): + return (hasattr(param, IS_TENSOR_PARALLEL) and getattr(param, IS_TENSOR_PARALLEL)) or (gpc.get_local_rank( + ParallelMode.TENSOR) == 0) + + +@contextmanager +def switch_virtual_pipeline_parallel_rank(rank): + prev_rank = gpc.virtual_pipeline_parallel_rank + try: + gpc.set_virtual_pipeline_parallel_rank(rank) + yield + finally: + gpc.set_virtual_pipeline_parallel_rank(prev_rank) diff --git a/colossalai/utils/data_sampler/__init__.py b/colossalai/legacy/utils/data_sampler/__init__.py similarity index 100% rename from colossalai/utils/data_sampler/__init__.py rename to colossalai/legacy/utils/data_sampler/__init__.py diff --git a/colossalai/utils/data_sampler/base_sampler.py b/colossalai/legacy/utils/data_sampler/base_sampler.py similarity index 100% rename from colossalai/utils/data_sampler/base_sampler.py rename to colossalai/legacy/utils/data_sampler/base_sampler.py diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/legacy/utils/data_sampler/data_parallel_sampler.py similarity index 98% rename from colossalai/utils/data_sampler/data_parallel_sampler.py rename to colossalai/legacy/utils/data_sampler/data_parallel_sampler.py index 881ddde78..66a5fdd36 100644 --- a/colossalai/utils/data_sampler/data_parallel_sampler.py +++ b/colossalai/legacy/utils/data_sampler/data_parallel_sampler.py @@ -10,8 +10,8 @@ import numpy as np import torch from torch.utils.data import DataLoader, Dataset, Sampler -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc T_co = TypeVar('T_co', covariant=True) diff --git a/colossalai/utils/memory.py b/colossalai/legacy/utils/memory.py similarity index 95% rename from colossalai/utils/memory.py rename to colossalai/legacy/utils/memory.py index 434e90edd..360bf0da4 100644 --- a/colossalai/utils/memory.py +++ b/colossalai/legacy/utils/memory.py @@ -1,15 +1,15 @@ -import torch import gc -import psutil from collections import namedtuple -from colossalai.context.parallel_mode import ParallelMode -from colossalai.utils import get_current_device -from colossalai.core import global_context as gpc -from colossalai.context.parallel_mode import ParallelMode -from colossalai.logging import get_dist_logger +import psutil +import torch +import torch.distributed as dist from packaging import version +from colossalai.legacy.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.utils import get_current_device + _GLOBAL_CUDA_MEM_FRACTION = 1.0 _GLOBAL_CPU_MEM_CAPACITY = -1 @@ -68,7 +68,7 @@ def report_memory_usage(message, logger=None, report_cpu=False): Raises: EnvironmentError: Raise error if no distributed environment has been initialized. 
""" - if not gpc.is_initialized(ParallelMode.GLOBAL): + if not dist.is_initialized(): raise EnvironmentError("No distributed environment is initialized") gpu_allocated = _bytes_to_MB(torch.cuda.memory_allocated()) @@ -138,7 +138,7 @@ def colo_device_memory_used(device: torch.device) -> int: def colo_set_process_memory_fraction(ratio: float) -> None: - """colo_set_process_memory_fraction + """colo_set_process_memory_fraction set how much cuda memory used on the gpu belonging to the current process. diff --git a/colossalai/utils/profiler/__init__.py b/colossalai/legacy/utils/profiler/__init__.py similarity index 100% rename from colossalai/utils/profiler/__init__.py rename to colossalai/legacy/utils/profiler/__init__.py diff --git a/colossalai/utils/profiler/extention.py b/colossalai/legacy/utils/profiler/extention.py similarity index 100% rename from colossalai/utils/profiler/extention.py rename to colossalai/legacy/utils/profiler/extention.py diff --git a/colossalai/utils/profiler/legacy/__init__.py b/colossalai/legacy/utils/profiler/legacy/__init__.py similarity index 77% rename from colossalai/utils/profiler/legacy/__init__.py rename to colossalai/legacy/utils/profiler/legacy/__init__.py index 849c7fca3..88beed86d 100644 --- a/colossalai/utils/profiler/legacy/__init__.py +++ b/colossalai/legacy/utils/profiler/legacy/__init__.py @@ -1,6 +1,6 @@ -from .comm_profiler import CommProfiler -from .pcie_profiler import PcieProfiler -from .prof_utils import ProfilerContext, BaseProfiler -from .mem_profiler import MemProfiler - -__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext'] +from .comm_profiler import CommProfiler +from .mem_profiler import MemProfiler +from .pcie_profiler import PcieProfiler +from .prof_utils import BaseProfiler, ProfilerContext + +__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext'] diff --git a/colossalai/utils/profiler/legacy/comm_profiler.py b/colossalai/legacy/utils/profiler/legacy/comm_profiler.py similarity index 96% rename from colossalai/utils/profiler/legacy/comm_profiler.py rename to colossalai/legacy/utils/profiler/legacy/comm_profiler.py index 334f0113e..bb7e2654c 100644 --- a/colossalai/utils/profiler/legacy/comm_profiler.py +++ b/colossalai/legacy/utils/profiler/legacy/comm_profiler.py @@ -1,308 +1,311 @@ -import inspect -from pathlib import Path -from functools import partial -import torch -from torch.autograd.profiler import profile -import torch.distributed as dist -from torch.distributed import ReduceOp -from colossalai.utils import get_current_device -from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth -from typing import List, Optional - - -def _get_code_location(depth: int): - ret = [] - length = min(len(inspect.stack()), depth + 1) - for i in range(3, length): - upper_frame = inspect.stack()[i] - function_name = inspect.stack()[i - 1].function - ret.append(upper_frame.filename) - ret.append('(') - ret.append(str(upper_frame.lineno)) - ret.append('): ') - ret.append(function_name) - if i != length - 1: - ret.append('\n') - - return ''.join(ret) - - -torch_all_reduce = dist.all_reduce -torch_all_gather = dist.all_gather -torch_reduce_scatter = dist.reduce_scatter -torch_broadcast = dist.broadcast -torch_reduce = dist.reduce - - -class CommEvent(object): - """Communication Event. Used for communication time and communication - volume recording. 
- """ - - def __init__(self, count: int = 0, comm_vol: float = 0., cuda_time: int = 0): - self.self_count = count - self.self_comm_vol = comm_vol - self.self_cuda_time = cuda_time - - def add(self, rhs): - self.self_count += rhs.self_count - self.self_comm_vol += rhs.self_comm_vol - self.self_cuda_time += rhs.self_cuda_time - - -class CommProfiler(BaseProfiler): - """Communication profiler. Records all communication events. - """ - - def __init__(self, depth: int = 0, total_count: int = 0, total_comm_vol: float = 0, total_cuda_time: int = 0): - super().__init__(profiler_name="Collective_Communication", priority=0) - self.depth = 3 + depth - self.total_count = total_count - self.total_comm_vol = total_comm_vol - self.total_cuda_time = total_cuda_time - - self.ops_record = dict() - self.profiler = None - self.pending_op = None - self.pending_metadata = None - self.warn_flag = False - - def reset(self): - self.total_count = 0 - self.total_comm_vol = 0 - self.total_cuda_time = 0 - - self.ops_record = dict() - self.profiler = None - self.pending_op = None - self.pending_metadata = None - self.warn_flag = False - - def enable(self): - dist.all_reduce = partial(all_reduce, profiler=self) - dist.all_gather = partial(all_gather, profiler=self) - dist.reduce_scatter = partial(reduce_scatter, profiler=self) - dist.broadcast = partial(broadcast, profiler=self) - dist.reduce = partial(reduce, profiler=self) - - def disable(self): - dist.all_reduce = torch_all_reduce - dist.all_gather = torch_all_gather - dist.reduce_scatter = torch_reduce_scatter - dist.broadcast = torch_broadcast - dist.reduce = torch_reduce - - def to_tensorboard(self, writer): - writer.add_text(tag="Collective Communication", text_string=self.result_str("\n\n")) - - def to_file(self, filename: Path): - with open(filename, "w") as f: - f.write(self.result_str()) - - def show(self): - print(self.result_str()) - - def result_str(self, sep: str = "\n"): - res = [] - - def append(s: str = None): - if s is not None: - res.append(s) - res.append(sep) - - if self.warn_flag: - append("Warning: there exists multiple communication operations in the same time. As a result, " - "the profiling result is not accurate.") - - if self.total_cuda_time == 0: - return "No collective communication has been called yet!" 
- - append("Collective communication profiling result:") - append("total cuda time: {}".format(_format_time(self.total_cuda_time))) - append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time))) - append("total number of calls: {}".format(self.total_count)) - append("All events:") - - separation = '-' * 74 - row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2 - - append(separation) - append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls')) - append(separation) - - show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time) - for location, event in show_list: - append(location) - append( - row_format.format('', _format_time(event.self_cuda_time), - '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0), - _format_memory(event.self_comm_vol), - _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count)) - append() - - return ''.join(res) - - @property - def has_aync_op(self): - return self.pending_op is not None - - def activate_profiler(self, kn: str, vol: float): - self.pending_metadata = (kn, _get_code_location(self.depth), vol) - self.profiler = profile(enabled=True, use_cuda=True, use_cpu=True, use_kineto=True) - self.profiler.__enter__() - - def close_profiler(self, group=None): - assert self.profiler is not None, "There is no running dist op" - kernel_name, code_location, vol = self.pending_metadata - self.profiler.__exit__(None, None, None) - - if self.profiler.enabled and dist.get_world_size(group) > 1: - assert_flag = 0 - current_comm_event = None - events = self.profiler.function_events - for event in events: - if kernel_name in event.name: - assert assert_flag == 0, "Multiple dist ops has been called " - current_comm_event = CommEvent(1, vol, event.self_cuda_time_total) - assert_flag += 1 - - assert current_comm_event is not None, "dist op has not been found" - - buffer = torch.tensor([current_comm_event.self_cuda_time], device=get_current_device()) - torch_all_reduce(buffer, op=ReduceOp.MIN, group=group) - current_comm_event.self_cuda_time = buffer.item() - - self.total_count += current_comm_event.self_count - self.total_comm_vol += current_comm_event.self_comm_vol - self.total_cuda_time += current_comm_event.self_cuda_time - if code_location in self.ops_record: - self.ops_record[code_location].add(current_comm_event) - else: - self.ops_record[code_location] = current_comm_event - - self.profiler = None - self.pending_op = None - self.pending_metadata = None - - def wait_async_op(self): - if self.pending_op is not None: - op = self.pending_op - op.wait() - self.close_profiler() - - -class CommHandler(object): - """Communication handler. A dummy handler to wait aync operations. 
- """ - - def __init__(self, profiler: CommProfiler): - super().__init__() - self.prof = profiler - - def wait(self): - self.prof.wait_async_op() - - -def async_check(profiler: CommProfiler): - if profiler.pending_op is not None: - profiler.warn_flag = True - profiler.wait_async_op() - - -def all_reduce(tensor: torch.Tensor, - op: ReduceOp = ReduceOp.SUM, - group=None, - async_op: bool = False, - profiler: CommProfiler = None) -> Optional[CommHandler]: - async_check(profiler) - - comm_size = dist.get_world_size(group) - correction = 2 * (comm_size - 1) / comm_size - comm_vol = correction * tensor.element_size() * tensor.numel() - profiler.activate_profiler("ncclKernel_AllReduce_", comm_vol) - profiler.pending_op = torch_all_reduce(tensor, op, group, async_op) - - if async_op: - return CommHandler(profiler) - - profiler.close_profiler(group) - - -def reduce_scatter(output: torch.Tensor, - input_list: List[torch.Tensor], - op: ReduceOp = ReduceOp.SUM, - group=None, - async_op: bool = False, - profiler: CommProfiler = None) -> Optional[CommHandler]: - async_check(profiler) - - comm_size = dist.get_world_size(group) - correction = (comm_size - 1) / comm_size - comm_vol = 0 - for tensor in input_list: - comm_vol += tensor.element_size() * tensor.numel() - comm_vol *= correction - profiler.activate_profiler("ncclKernel_ReduceScatter_", comm_vol) - profiler.pending_op = torch_reduce_scatter(output, input_list, op, group, async_op) - - if async_op: - return CommHandler(profiler) - - profiler.close_profiler(group) - - -def all_gather(tensor_list: List[torch.Tensor], - tensor: torch.Tensor, - group=None, - async_op: bool = False, - profiler: CommProfiler = None) -> Optional[CommHandler]: - async_check(profiler) - - comm_size = dist.get_world_size(group) - correction = (comm_size - 1) / comm_size - comm_vol = 0 - for ten in tensor_list: - comm_vol += ten.element_size() * ten.numel() - comm_vol *= correction - profiler.activate_profiler("ncclKernel_AllGather_", comm_vol) - profiler.pending_op = torch_all_gather(tensor_list, tensor, group, async_op) - - if async_op: - return CommHandler(profiler) - - profiler.close_profiler(group) - - -def broadcast(tensor: torch.Tensor, - src: int, - group=None, - async_op: bool = False, - profiler: CommProfiler = None) -> Optional[CommHandler]: - async_check(profiler) - - comm_vol = 1.0 * tensor.element_size() * tensor.numel() - profiler.activate_profiler("ncclKernel_Broadcast_", comm_vol) - profiler.pending_op = torch_broadcast(tensor, src, group, async_op) - - if async_op: - return CommHandler(profiler) - - profiler.close_profiler(group) - - -def reduce(tensor: torch.Tensor, - dst: int, - op: ReduceOp = ReduceOp.SUM, - group=None, - async_op: bool = False, - profiler: CommProfiler = None) -> Optional[CommHandler]: - async_check(profiler) - - comm_vol = 1.0 * tensor.element_size() * tensor.numel() - profiler.activate_profiler("ncclKernel_Reduce_", comm_vol) - profiler.pending_op = torch_reduce(tensor, dst, op, group, async_op) - - if async_op: - return CommHandler(profiler) - - profiler.close_profiler(group) +import inspect +from functools import partial +from pathlib import Path +from typing import List, Optional + +import torch +import torch.distributed as dist +from torch.autograd.profiler import profile +from torch.distributed import ReduceOp + +from colossalai.utils import get_current_device + +from .prof_utils import BaseProfiler, _format_bandwidth, _format_memory, _format_time + + +def _get_code_location(depth: int): + ret = [] + length = 
min(len(inspect.stack()), depth + 1) + for i in range(3, length): + upper_frame = inspect.stack()[i] + function_name = inspect.stack()[i - 1].function + ret.append(upper_frame.filename) + ret.append('(') + ret.append(str(upper_frame.lineno)) + ret.append('): ') + ret.append(function_name) + if i != length - 1: + ret.append('\n') + + return ''.join(ret) + + +torch_all_reduce = dist.all_reduce +torch_all_gather = dist.all_gather +torch_reduce_scatter = dist.reduce_scatter +torch_broadcast = dist.broadcast +torch_reduce = dist.reduce + + +class CommEvent(object): + """Communication Event. Used for communication time and communication + volume recording. + """ + + def __init__(self, count: int = 0, comm_vol: float = 0., cuda_time: int = 0): + self.self_count = count + self.self_comm_vol = comm_vol + self.self_cuda_time = cuda_time + + def add(self, rhs): + self.self_count += rhs.self_count + self.self_comm_vol += rhs.self_comm_vol + self.self_cuda_time += rhs.self_cuda_time + + +class CommProfiler(BaseProfiler): + """Communication profiler. Records all communication events. + """ + + def __init__(self, depth: int = 0, total_count: int = 0, total_comm_vol: float = 0, total_cuda_time: int = 0): + super().__init__(profiler_name="Collective_Communication", priority=0) + self.depth = 3 + depth + self.total_count = total_count + self.total_comm_vol = total_comm_vol + self.total_cuda_time = total_cuda_time + + self.ops_record = dict() + self.profiler = None + self.pending_op = None + self.pending_metadata = None + self.warn_flag = False + + def reset(self): + self.total_count = 0 + self.total_comm_vol = 0 + self.total_cuda_time = 0 + + self.ops_record = dict() + self.profiler = None + self.pending_op = None + self.pending_metadata = None + self.warn_flag = False + + def enable(self): + dist.all_reduce = partial(all_reduce, profiler=self) + dist.all_gather = partial(all_gather, profiler=self) + dist.reduce_scatter = partial(reduce_scatter, profiler=self) + dist.broadcast = partial(broadcast, profiler=self) + dist.reduce = partial(reduce, profiler=self) + + def disable(self): + dist.all_reduce = torch_all_reduce + dist.all_gather = torch_all_gather + dist.reduce_scatter = torch_reduce_scatter + dist.broadcast = torch_broadcast + dist.reduce = torch_reduce + + def to_tensorboard(self, writer): + writer.add_text(tag="Collective Communication", text_string=self.result_str("\n\n")) + + def to_file(self, filename: Path): + with open(filename, "w") as f: + f.write(self.result_str()) + + def show(self): + print(self.result_str()) + + def result_str(self, sep: str = "\n"): + res = [] + + def append(s: str = None): + if s is not None: + res.append(s) + res.append(sep) + + if self.warn_flag: + append("Warning: there exists multiple communication operations in the same time. As a result, " + "the profiling result is not accurate.") + + if self.total_cuda_time == 0: + return "No collective communication has been called yet!" 
+ + append("Collective communication profiling result:") + append("total cuda time: {}".format(_format_time(self.total_cuda_time))) + append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time))) + append("total number of calls: {}".format(self.total_count)) + append("All events:") + + separation = '-' * 74 + row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2 + + append(separation) + append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls')) + append(separation) + + show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time) + for location, event in show_list: + append(location) + append( + row_format.format('', _format_time(event.self_cuda_time), + '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0), + _format_memory(event.self_comm_vol), + _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count)) + append() + + return ''.join(res) + + @property + def has_aync_op(self): + return self.pending_op is not None + + def activate_profiler(self, kn: str, vol: float): + self.pending_metadata = (kn, _get_code_location(self.depth), vol) + self.profiler = profile(enabled=True, use_cuda=True, use_cpu=True, use_kineto=True) + self.profiler.__enter__() + + def close_profiler(self, group=None): + assert self.profiler is not None, "There is no running dist op" + kernel_name, code_location, vol = self.pending_metadata + self.profiler.__exit__(None, None, None) + + if self.profiler.enabled and dist.get_world_size(group) > 1: + assert_flag = 0 + current_comm_event = None + events = self.profiler.function_events + for event in events: + if kernel_name in event.name: + assert assert_flag == 0, "Multiple dist ops has been called " + current_comm_event = CommEvent(1, vol, event.self_cuda_time_total) + assert_flag += 1 + + assert current_comm_event is not None, "dist op has not been found" + + buffer = torch.tensor([current_comm_event.self_cuda_time], device=get_current_device()) + torch_all_reduce(buffer, op=ReduceOp.MIN, group=group) + current_comm_event.self_cuda_time = buffer.item() + + self.total_count += current_comm_event.self_count + self.total_comm_vol += current_comm_event.self_comm_vol + self.total_cuda_time += current_comm_event.self_cuda_time + if code_location in self.ops_record: + self.ops_record[code_location].add(current_comm_event) + else: + self.ops_record[code_location] = current_comm_event + + self.profiler = None + self.pending_op = None + self.pending_metadata = None + + def wait_async_op(self): + if self.pending_op is not None: + op = self.pending_op + op.wait() + self.close_profiler() + + +class CommHandler(object): + """Communication handler. A dummy handler to wait aync operations. 
+ """ + + def __init__(self, profiler: CommProfiler): + super().__init__() + self.prof = profiler + + def wait(self): + self.prof.wait_async_op() + + +def async_check(profiler: CommProfiler): + if profiler.pending_op is not None: + profiler.warn_flag = True + profiler.wait_async_op() + + +def all_reduce(tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + group=None, + async_op: bool = False, + profiler: CommProfiler = None) -> Optional[CommHandler]: + async_check(profiler) + + comm_size = dist.get_world_size(group) + correction = 2 * (comm_size - 1) / comm_size + comm_vol = correction * tensor.element_size() * tensor.numel() + profiler.activate_profiler("ncclKernel_AllReduce_", comm_vol) + profiler.pending_op = torch_all_reduce(tensor, op, group, async_op) + + if async_op: + return CommHandler(profiler) + + profiler.close_profiler(group) + + +def reduce_scatter(output: torch.Tensor, + input_list: List[torch.Tensor], + op: ReduceOp = ReduceOp.SUM, + group=None, + async_op: bool = False, + profiler: CommProfiler = None) -> Optional[CommHandler]: + async_check(profiler) + + comm_size = dist.get_world_size(group) + correction = (comm_size - 1) / comm_size + comm_vol = 0 + for tensor in input_list: + comm_vol += tensor.element_size() * tensor.numel() + comm_vol *= correction + profiler.activate_profiler("ncclKernel_ReduceScatter_", comm_vol) + profiler.pending_op = torch_reduce_scatter(output, input_list, op, group, async_op) + + if async_op: + return CommHandler(profiler) + + profiler.close_profiler(group) + + +def all_gather(tensor_list: List[torch.Tensor], + tensor: torch.Tensor, + group=None, + async_op: bool = False, + profiler: CommProfiler = None) -> Optional[CommHandler]: + async_check(profiler) + + comm_size = dist.get_world_size(group) + correction = (comm_size - 1) / comm_size + comm_vol = 0 + for ten in tensor_list: + comm_vol += ten.element_size() * ten.numel() + comm_vol *= correction + profiler.activate_profiler("ncclKernel_AllGather_", comm_vol) + profiler.pending_op = torch_all_gather(tensor_list, tensor, group, async_op) + + if async_op: + return CommHandler(profiler) + + profiler.close_profiler(group) + + +def broadcast(tensor: torch.Tensor, + src: int, + group=None, + async_op: bool = False, + profiler: CommProfiler = None) -> Optional[CommHandler]: + async_check(profiler) + + comm_vol = 1.0 * tensor.element_size() * tensor.numel() + profiler.activate_profiler("ncclKernel_Broadcast_", comm_vol) + profiler.pending_op = torch_broadcast(tensor, src, group, async_op) + + if async_op: + return CommHandler(profiler) + + profiler.close_profiler(group) + + +def reduce(tensor: torch.Tensor, + dst: int, + op: ReduceOp = ReduceOp.SUM, + group=None, + async_op: bool = False, + profiler: CommProfiler = None) -> Optional[CommHandler]: + async_check(profiler) + + comm_vol = 1.0 * tensor.element_size() * tensor.numel() + profiler.activate_profiler("ncclKernel_Reduce_", comm_vol) + profiler.pending_op = torch_reduce(tensor, dst, op, group, async_op) + + if async_op: + return CommHandler(profiler) + + profiler.close_profiler(group) diff --git a/colossalai/utils/profiler/legacy/pcie_profiler.py b/colossalai/legacy/utils/profiler/legacy/pcie_profiler.py similarity index 95% rename from colossalai/utils/profiler/legacy/pcie_profiler.py rename to colossalai/legacy/utils/profiler/legacy/pcie_profiler.py index 8f812f5cf..514d3c6fa 100644 --- a/colossalai/utils/profiler/legacy/pcie_profiler.py +++ b/colossalai/legacy/utils/profiler/legacy/pcie_profiler.py @@ -1,148 +1,150 @@ -from pathlib import 
Path -from torch.autograd.profiler import profile -from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth -from typing import List - - -def _get_size(dtype: str): - if dtype == "fp16": - return 2 - elif dtype == "fp32": - return 4 - else: - raise NotImplementedError - - -def _get_numel(my_list: List[int]) -> int: - from functools import reduce - from operator import mul - return reduce(mul, my_list) - - -def _reduce_location(locations: List[str]) -> str: - ret = [] - for lo in locations: - ret.append(lo) - ret.append("\n") - ret = ret[:-1] - return ''.join(ret) - - -class PcieEvent(object): - """Pcie Event. - """ - - def __init__(self, count: int = 0, pcie_vol: int = 0, cuda_time: int = 0): - self.count = count - self.pcie_vol = pcie_vol - self.cuda_time = cuda_time - - def add(self, rhs): - self.count += rhs.count - self.pcie_vol += rhs.pcie_vol - self.cuda_time += rhs.cuda_time - - -class PcieProfiler(BaseProfiler): - """Pcie profiler. Records all data transmission between CPU and GPU. - - TODO: Merge pcie profiler into communication profiler - """ - - def __init__(self, dtype: str = "fp32", depth: int = 1): - super().__init__(profiler_name="Pcie", priority=10) - self.depth = depth - self.data_size = _get_size(dtype) - self.h2d_count = 0 - self.h2d_time = 0 - self.d2h_count = 0 - self.d2h_time = 0 - - self.ops_record = dict() - self.profiler = None - - def reset(self): - self.h2d_count = 0 - self.h2d_time = 0 - self.d2h_count = 0 - self.d2h_time = 0 - - self.ops_record = dict() - self.profiler = None - - def enable(self): - self.profiler = profile(enabled=True, - use_cuda=True, - use_cpu=True, - use_kineto=True, - record_shapes=True, - with_stack=True) - self.profiler.__enter__() - - def disable(self): - self.profiler.__exit__(None, None, None) - - if self.profiler.enabled: - events = self.profiler.function_events - for event in events: - if event.name == "aten::copy_": - t_shape = event.input_shapes[0] - if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0: - continue - current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total) - code_location = _reduce_location(event.stack[:self.depth]) - if code_location in self.ops_record: - self.ops_record[code_location].add(current_comm_event) - else: - self.ops_record[code_location] = current_comm_event - elif 'Memcpy HtoD' in event.name: - self.h2d_count += 1 - self.h2d_time += event.cuda_time_total - elif 'Memcpy DtoH' in event.name: - self.d2h_count += 1 - self.d2h_time += event.cuda_time_total - - self.profiler = None - - def to_tensorboard(self, writer): - writer.add_text(tag="Data Transmission", text_string=self.result_str("\n\n")) - - def to_file(self, filename: Path): - with open(filename, "w") as f: - f.write(self.result_str()) - - def show(self): - print(self.result_str()) - - def result_str(self, sep: str = "\n"): - res = [] - - def append(s: str = None): - if s is not None: - res.append(s) - res.append(sep) - - append("Pcie profiling result:") - append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time))) - append("number of transmission (CPU -> GPU): {}".format(self.h2d_count)) - append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time))) - append("number of transmission (GPU -> CPU): {}".format(self.d2h_count)) - - append("Possible data transmission events in PCIE:") - - separation = '-' * 62 - row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2 - - append(separation) - 
append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls')) - append(separation) - - show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time) - for location, event in show_list: - append(location) - append( - row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol), - _format_bandwidth(event.pcie_vol, event.cuda_time), event.count)) - append() - - return ''.join(res) +from pathlib import Path +from typing import List + +from torch.autograd.profiler import profile + +from .prof_utils import BaseProfiler, _format_bandwidth, _format_memory, _format_time + + +def _get_size(dtype: str): + if dtype == "fp16": + return 2 + elif dtype == "fp32": + return 4 + else: + raise NotImplementedError + + +def _get_numel(my_list: List[int]) -> int: + from functools import reduce + from operator import mul + return reduce(mul, my_list) + + +def _reduce_location(locations: List[str]) -> str: + ret = [] + for lo in locations: + ret.append(lo) + ret.append("\n") + ret = ret[:-1] + return ''.join(ret) + + +class PcieEvent(object): + """Pcie Event. + """ + + def __init__(self, count: int = 0, pcie_vol: int = 0, cuda_time: int = 0): + self.count = count + self.pcie_vol = pcie_vol + self.cuda_time = cuda_time + + def add(self, rhs): + self.count += rhs.count + self.pcie_vol += rhs.pcie_vol + self.cuda_time += rhs.cuda_time + + +class PcieProfiler(BaseProfiler): + """Pcie profiler. Records all data transmission between CPU and GPU. + + TODO: Merge pcie profiler into communication profiler + """ + + def __init__(self, dtype: str = "fp32", depth: int = 1): + super().__init__(profiler_name="Pcie", priority=10) + self.depth = depth + self.data_size = _get_size(dtype) + self.h2d_count = 0 + self.h2d_time = 0 + self.d2h_count = 0 + self.d2h_time = 0 + + self.ops_record = dict() + self.profiler = None + + def reset(self): + self.h2d_count = 0 + self.h2d_time = 0 + self.d2h_count = 0 + self.d2h_time = 0 + + self.ops_record = dict() + self.profiler = None + + def enable(self): + self.profiler = profile(enabled=True, + use_cuda=True, + use_cpu=True, + use_kineto=True, + record_shapes=True, + with_stack=True) + self.profiler.__enter__() + + def disable(self): + self.profiler.__exit__(None, None, None) + + if self.profiler.enabled: + events = self.profiler.function_events + for event in events: + if event.name == "aten::copy_": + t_shape = event.input_shapes[0] + if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0: + continue + current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total) + code_location = _reduce_location(event.stack[:self.depth]) + if code_location in self.ops_record: + self.ops_record[code_location].add(current_comm_event) + else: + self.ops_record[code_location] = current_comm_event + elif 'Memcpy HtoD' in event.name: + self.h2d_count += 1 + self.h2d_time += event.cuda_time_total + elif 'Memcpy DtoH' in event.name: + self.d2h_count += 1 + self.d2h_time += event.cuda_time_total + + self.profiler = None + + def to_tensorboard(self, writer): + writer.add_text(tag="Data Transmission", text_string=self.result_str("\n\n")) + + def to_file(self, filename: Path): + with open(filename, "w") as f: + f.write(self.result_str()) + + def show(self): + print(self.result_str()) + + def result_str(self, sep: str = "\n"): + res = [] + + def append(s: str = None): + if s is not None: + res.append(s) + res.append(sep) + + append("Pcie profiling result:") + append("time of data transmission 
(CPU -> GPU): {}".format(_format_time(self.h2d_time))) + append("number of transmission (CPU -> GPU): {}".format(self.h2d_count)) + append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time))) + append("number of transmission (GPU -> CPU): {}".format(self.d2h_count)) + + append("Possible data transmission events in PCIE:") + + separation = '-' * 62 + row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2 + + append(separation) + append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls')) + append(separation) + + show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time) + for location, event in show_list: + append(location) + append( + row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol), + _format_bandwidth(event.pcie_vol, event.cuda_time), event.count)) + append() + + return ''.join(res) diff --git a/colossalai/utils/profiler/legacy/prof_utils.py b/colossalai/legacy/utils/profiler/legacy/prof_utils.py similarity index 94% rename from colossalai/utils/profiler/legacy/prof_utils.py rename to colossalai/legacy/utils/profiler/legacy/prof_utils.py index 2f7eee827..9b948c9ec 100644 --- a/colossalai/utils/profiler/legacy/prof_utils.py +++ b/colossalai/legacy/utils/profiler/legacy/prof_utils.py @@ -1,131 +1,132 @@ -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Union, List -from colossalai.core import global_context as gpc - - -# copied from high version pytorch to support low version -def _format_time(time_us): - """Defines how to format time in FunctionEvent""" - US_IN_SECOND = 1000.0 * 1000.0 - US_IN_MS = 1000.0 - if time_us >= US_IN_SECOND: - return '{:.3f}s'.format(time_us / US_IN_SECOND) - if time_us >= US_IN_MS: - return '{:.3f}ms'.format(time_us / US_IN_MS) - return '{:.3f}us'.format(time_us) - - -# copied from high version pytorch to support low version -def _format_memory(nbytes): - """Returns a formatted memory size string""" - KB = 1024 - MB = 1024 * KB - GB = 1024 * MB - if (abs(nbytes) >= GB): - return '{:.2f} GB'.format(nbytes * 1.0 / GB) - elif (abs(nbytes) >= MB): - return '{:.2f} MB'.format(nbytes * 1.0 / MB) - elif (abs(nbytes) >= KB): - return '{:.2f} KB'.format(nbytes * 1.0 / KB) - else: - return str(nbytes) + ' B' - - -def _format_bandwidth(volume: float or int, time_us: int): - sec_div_mb = (1000.0 / 1024.0)**2 - mb_per_sec = volume / time_us * sec_div_mb - - if mb_per_sec >= 1024.0: - return '{:.3f} GB/s'.format(mb_per_sec / 1024.0) - else: - return '{:.3f} MB/s'.format(mb_per_sec) - - -class BaseProfiler(ABC): - - def __init__(self, profiler_name: str, priority: int): - self.name = profiler_name - self.priority = priority - - @abstractmethod - def enable(self): - pass - - @abstractmethod - def disable(self): - pass - - @abstractmethod - def to_tensorboard(self, writer): - pass - - @abstractmethod - def to_file(self, filename: Path): - pass - - @abstractmethod - def show(self): - pass - - -class ProfilerContext(object): - """Profiler context manager - - Usage:: - - world_size = 4 - inputs = torch.randn(10, 10, dtype=torch.float32, device=get_current_device()) - outputs = torch.empty(world_size, 10, 10, dtype=torch.float32, device=get_current_device()) - outputs_list = list(torch.chunk(outputs, chunks=world_size, dim=0)) - - cc_prof = CommProfiler() - - with ProfilerContext([cc_prof]) as prof: - op = dist.all_reduce(inputs, async_op=True) - dist.all_gather(outputs_list, inputs) - op.wait() - dist.reduce_scatter(inputs, 
outputs_list) - dist.broadcast(inputs, 0) - dist.reduce(inputs, 0) - - prof.show() - """ - - def __init__(self, profilers: List[BaseProfiler] = None, enable: bool = True): - self.enable = enable - self.profilers = sorted(profilers, key=lambda prof: prof.priority) - - def __enter__(self): - if self.enable: - for prof in self.profilers: - prof.enable() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.enable: - for prof in self.profilers: - prof.disable() - - def to_tensorboard(self, writer): - from torch.utils.tensorboard import SummaryWriter - - assert isinstance(writer, SummaryWriter), \ - f'torch.utils.tensorboard.SummaryWriter is required, but found {type(writer)}.' - - for prof in self.profilers: - prof.to_tensorboard(writer) - - def to_file(self, log_dir: Union[str, Path]): - if isinstance(log_dir, str): - log_dir = Path(log_dir) - - if not log_dir.exists(): - log_dir.mkdir(parents=True, exist_ok=True) - for prof in self.profilers: - log_file = log_dir.joinpath(f'{prof.name}_rank_{gpc.get_global_rank()}.log') - prof.to_file(log_file) - - def show(self): - for prof in self.profilers: - prof.show() +from abc import ABC, abstractmethod +from pathlib import Path +from typing import List, Union + +from colossalai.legacy.core import global_context as gpc + + +# copied from high version pytorch to support low version +def _format_time(time_us): + """Defines how to format time in FunctionEvent""" + US_IN_SECOND = 1000.0 * 1000.0 + US_IN_MS = 1000.0 + if time_us >= US_IN_SECOND: + return '{:.3f}s'.format(time_us / US_IN_SECOND) + if time_us >= US_IN_MS: + return '{:.3f}ms'.format(time_us / US_IN_MS) + return '{:.3f}us'.format(time_us) + + +# copied from high version pytorch to support low version +def _format_memory(nbytes): + """Returns a formatted memory size string""" + KB = 1024 + MB = 1024 * KB + GB = 1024 * MB + if (abs(nbytes) >= GB): + return '{:.2f} GB'.format(nbytes * 1.0 / GB) + elif (abs(nbytes) >= MB): + return '{:.2f} MB'.format(nbytes * 1.0 / MB) + elif (abs(nbytes) >= KB): + return '{:.2f} KB'.format(nbytes * 1.0 / KB) + else: + return str(nbytes) + ' B' + + +def _format_bandwidth(volume: float or int, time_us: int): + sec_div_mb = (1000.0 / 1024.0)**2 + mb_per_sec = volume / time_us * sec_div_mb + + if mb_per_sec >= 1024.0: + return '{:.3f} GB/s'.format(mb_per_sec / 1024.0) + else: + return '{:.3f} MB/s'.format(mb_per_sec) + + +class BaseProfiler(ABC): + + def __init__(self, profiler_name: str, priority: int): + self.name = profiler_name + self.priority = priority + + @abstractmethod + def enable(self): + pass + + @abstractmethod + def disable(self): + pass + + @abstractmethod + def to_tensorboard(self, writer): + pass + + @abstractmethod + def to_file(self, filename: Path): + pass + + @abstractmethod + def show(self): + pass + + +class ProfilerContext(object): + """Profiler context manager + + Usage:: + + world_size = 4 + inputs = torch.randn(10, 10, dtype=torch.float32, device=get_current_device()) + outputs = torch.empty(world_size, 10, 10, dtype=torch.float32, device=get_current_device()) + outputs_list = list(torch.chunk(outputs, chunks=world_size, dim=0)) + + cc_prof = CommProfiler() + + with ProfilerContext([cc_prof]) as prof: + op = dist.all_reduce(inputs, async_op=True) + dist.all_gather(outputs_list, inputs) + op.wait() + dist.reduce_scatter(inputs, outputs_list) + dist.broadcast(inputs, 0) + dist.reduce(inputs, 0) + + prof.show() + """ + + def __init__(self, profilers: List[BaseProfiler] = None, enable: bool = True): + self.enable = enable + 
self.profilers = sorted(profilers, key=lambda prof: prof.priority) + + def __enter__(self): + if self.enable: + for prof in self.profilers: + prof.enable() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.enable: + for prof in self.profilers: + prof.disable() + + def to_tensorboard(self, writer): + from torch.utils.tensorboard import SummaryWriter + + assert isinstance(writer, SummaryWriter), \ + f'torch.utils.tensorboard.SummaryWriter is required, but found {type(writer)}.' + + for prof in self.profilers: + prof.to_tensorboard(writer) + + def to_file(self, log_dir: Union[str, Path]): + if isinstance(log_dir, str): + log_dir = Path(log_dir) + + if not log_dir.exists(): + log_dir.mkdir(parents=True, exist_ok=True) + for prof in self.profilers: + log_file = log_dir.joinpath(f'{prof.name}_rank_{gpc.get_global_rank()}.log') + prof.to_file(log_file) + + def show(self): + for prof in self.profilers: + prof.show() diff --git a/colossalai/utils/profiler/profiler.py b/colossalai/legacy/utils/profiler/profiler.py similarity index 97% rename from colossalai/utils/profiler/profiler.py rename to colossalai/legacy/utils/profiler/profiler.py index 3026d723d..0827f06b5 100644 --- a/colossalai/utils/profiler/profiler.py +++ b/colossalai/legacy/utils/profiler/profiler.py @@ -9,9 +9,9 @@ from torch.profiler import profile as torch_profile from torch.profiler.profiler import ProfilerAction from colossalai.legacy.engine import Engine +from colossalai.legacy.utils.profiler.extention import ProfilerExtension +from colossalai.legacy.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention from colossalai.logging import get_dist_logger -from colossalai.utils.profiler.extention import ProfilerExtension -from colossalai.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention class profile(torch_profile): diff --git a/colossalai/utils/profiler/stateful_tensor_mem_extention.py b/colossalai/legacy/utils/profiler/stateful_tensor_mem_extention.py similarity index 98% rename from colossalai/utils/profiler/stateful_tensor_mem_extention.py rename to colossalai/legacy/utils/profiler/stateful_tensor_mem_extention.py index 412bd7277..f3bb66ced 100644 --- a/colossalai/utils/profiler/stateful_tensor_mem_extention.py +++ b/colossalai/legacy/utils/profiler/stateful_tensor_mem_extention.py @@ -9,7 +9,7 @@ import torch from colossalai.gemini.ophooks import BaseOpHook from colossalai.gemini.stateful_tensor import StatefulTensor from colossalai.legacy.engine import Engine -from colossalai.utils.profiler.extention import ProfilerExtension +from colossalai.legacy.utils.profiler.extention import ProfilerExtension class DeviceType(Enum): diff --git a/colossalai/zero/legacy/__init__.py b/colossalai/legacy/zero/__init__.py similarity index 100% rename from colossalai/zero/legacy/__init__.py rename to colossalai/legacy/zero/__init__.py diff --git a/colossalai/zero/legacy/gemini/__init__.py b/colossalai/legacy/zero/gemini/__init__.py similarity index 100% rename from colossalai/zero/legacy/gemini/__init__.py rename to colossalai/legacy/zero/gemini/__init__.py diff --git a/colossalai/zero/legacy/gemini/gemini_context.py b/colossalai/legacy/zero/gemini/gemini_context.py similarity index 100% rename from colossalai/zero/legacy/gemini/gemini_context.py rename to colossalai/legacy/zero/gemini/gemini_context.py diff --git a/colossalai/zero/legacy/gemini/ophooks/__init__.py b/colossalai/legacy/zero/gemini/ophooks/__init__.py similarity index 100% rename from 
colossalai/zero/legacy/gemini/ophooks/__init__.py rename to colossalai/legacy/zero/gemini/ophooks/__init__.py diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py similarity index 100% rename from colossalai/zero/legacy/gemini/ophooks/_shard_grad_ophook.py rename to colossalai/legacy/zero/gemini/ophooks/_shard_grad_ophook.py diff --git a/colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py b/colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py similarity index 100% rename from colossalai/zero/legacy/gemini/ophooks/_shard_param_ophook.py rename to colossalai/legacy/zero/gemini/ophooks/_shard_param_ophook.py diff --git a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py similarity index 98% rename from colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py rename to colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py index f40d6ced1..eebcf86e0 100644 --- a/colossalai/zero/legacy/gemini/ophooks/runtime_mem_tracer_hook.py +++ b/colossalai/legacy/zero/gemini/ophooks/runtime_mem_tracer_hook.py @@ -5,9 +5,9 @@ from typing import List import torch +from colossalai.legacy.zero.gemini.tensor_utils import alloc_storage, free_storage from colossalai.tensor.param_op_hook import ColoParamOpHook from colossalai.zero.gemini.memory_tracer import MemStats, SyncCudaMemoryMonitor -from colossalai.zero.legacy.gemini.tensor_utils import alloc_storage, free_storage class TrainingPhase(Enum): diff --git a/colossalai/zero/legacy/gemini/ophooks/utils.py b/colossalai/legacy/zero/gemini/ophooks/utils.py similarity index 100% rename from colossalai/zero/legacy/gemini/ophooks/utils.py rename to colossalai/legacy/zero/gemini/ophooks/utils.py diff --git a/colossalai/zero/legacy/gemini/paramhooks/__init__.py b/colossalai/legacy/zero/gemini/paramhooks/__init__.py similarity index 100% rename from colossalai/zero/legacy/gemini/paramhooks/__init__.py rename to colossalai/legacy/zero/gemini/paramhooks/__init__.py diff --git a/colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py b/colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py similarity index 100% rename from colossalai/zero/legacy/gemini/paramhooks/_param_hookmgr.py rename to colossalai/legacy/zero/gemini/paramhooks/_param_hookmgr.py diff --git a/colossalai/zero/legacy/gemini/stateful_tensor.py b/colossalai/legacy/zero/gemini/stateful_tensor.py similarity index 100% rename from colossalai/zero/legacy/gemini/stateful_tensor.py rename to colossalai/legacy/zero/gemini/stateful_tensor.py diff --git a/colossalai/zero/legacy/gemini/stateful_tensor_mgr.py b/colossalai/legacy/zero/gemini/stateful_tensor_mgr.py similarity index 100% rename from colossalai/zero/legacy/gemini/stateful_tensor_mgr.py rename to colossalai/legacy/zero/gemini/stateful_tensor_mgr.py diff --git a/colossalai/zero/legacy/gemini/tensor_placement_policy.py b/colossalai/legacy/zero/gemini/tensor_placement_policy.py similarity index 98% rename from colossalai/zero/legacy/gemini/tensor_placement_policy.py rename to colossalai/legacy/zero/gemini/tensor_placement_policy.py index 165ae51fe..275933ec2 100644 --- a/colossalai/zero/legacy/gemini/tensor_placement_policy.py +++ b/colossalai/legacy/zero/gemini/tensor_placement_policy.py @@ -5,8 +5,8 @@ from typing import List, Optional, Type import torch +from colossalai.legacy.utils.memory import colo_device_memory_capacity from colossalai.utils import 
get_current_device -from colossalai.utils.memory import colo_device_memory_capacity from colossalai.zero.gemini.memory_tracer import MemStatsCollector from .stateful_tensor import StatefulTensor diff --git a/colossalai/zero/legacy/gemini/tensor_utils.py b/colossalai/legacy/zero/gemini/tensor_utils.py similarity index 100% rename from colossalai/zero/legacy/gemini/tensor_utils.py rename to colossalai/legacy/zero/gemini/tensor_utils.py diff --git a/colossalai/zero/legacy/init_ctx/__init__.py b/colossalai/legacy/zero/init_ctx/__init__.py similarity index 100% rename from colossalai/zero/legacy/init_ctx/__init__.py rename to colossalai/legacy/zero/init_ctx/__init__.py diff --git a/colossalai/zero/legacy/init_ctx/init_context.py b/colossalai/legacy/zero/init_ctx/init_context.py similarity index 96% rename from colossalai/zero/legacy/init_ctx/init_context.py rename to colossalai/legacy/zero/init_ctx/init_context.py index 84e2d2f4f..4a7e46408 100644 --- a/colossalai/zero/legacy/init_ctx/init_context.py +++ b/colossalai/legacy/zero/init_ctx/init_context.py @@ -8,15 +8,15 @@ import torch import torch.distributed as dist import torch.nn as nn -from colossalai.context.parallel_mode import ParallelMode from colossalai.context.singleton_meta import SingletonMeta -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.zero.shard_utils import BaseShardStrategy +from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16 +from colossalai.legacy.zero.sharded_model.sharded_model_v2 import ShardedModelV2 +from colossalai.legacy.zero.sharded_param import ShardedParamV2 from colossalai.logging import get_dist_logger from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses -from colossalai.zero.legacy.shard_utils import BaseShardStrategy -from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_bf16, cast_tensor_to_fp16 -from colossalai.zero.legacy.sharded_model.sharded_model_v2 import ShardedModelV2 -from colossalai.zero.legacy.sharded_param import ShardedParamV2 @dataclass diff --git a/colossalai/zero/legacy/shard_utils/__init__.py b/colossalai/legacy/zero/shard_utils/__init__.py similarity index 100% rename from colossalai/zero/legacy/shard_utils/__init__.py rename to colossalai/legacy/zero/shard_utils/__init__.py diff --git a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py similarity index 90% rename from colossalai/zero/legacy/shard_utils/base_shard_strategy.py rename to colossalai/legacy/zero/shard_utils/base_shard_strategy.py index 7ca951091..9fb80f57a 100644 --- a/colossalai/zero/legacy/shard_utils/base_shard_strategy.py +++ b/colossalai/legacy/zero/shard_utils/base_shard_strategy.py @@ -3,7 +3,7 @@ from typing import List, Optional import torch.distributed as dist -from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor +from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor class BaseShardStrategy(ABC): diff --git a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py similarity index 97% rename from colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py rename to colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py index d66310483..1f7baad57 100644 --- 
a/colossalai/zero/legacy/shard_utils/bucket_tensor_shard_strategy.py +++ b/colossalai/legacy/zero/shard_utils/bucket_tensor_shard_strategy.py @@ -4,8 +4,8 @@ import torch import torch.distributed as dist from torch._utils import _flatten_dense_tensors as flatten +from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor from colossalai.utils import get_current_device -from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor from .tensor_shard_strategy import TensorShardStrategy diff --git a/colossalai/zero/legacy/shard_utils/commons.py b/colossalai/legacy/zero/shard_utils/commons.py similarity index 100% rename from colossalai/zero/legacy/shard_utils/commons.py rename to colossalai/legacy/zero/shard_utils/commons.py diff --git a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py similarity index 90% rename from colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py rename to colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py index d1df4803b..cc43907f6 100644 --- a/colossalai/zero/legacy/shard_utils/tensor_shard_strategy.py +++ b/colossalai/legacy/zero/shard_utils/tensor_shard_strategy.py @@ -3,11 +3,11 @@ from typing import List, Optional import torch import torch.distributed as dist +from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline +from colossalai.legacy.zero.shard_utils import BaseShardStrategy +from colossalai.legacy.zero.shard_utils.commons import get_shard +from colossalai.legacy.zero.sharded_param.sharded_tensor import ShardedTensor from colossalai.utils import get_current_device -from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline -from colossalai.zero.legacy.shard_utils import BaseShardStrategy -from colossalai.zero.legacy.shard_utils.commons import get_shard -from colossalai.zero.legacy.sharded_param.sharded_tensor import ShardedTensor class TensorShardStrategy(BaseShardStrategy): diff --git a/colossalai/zero/legacy/sharded_model/__init__.py b/colossalai/legacy/zero/sharded_model/__init__.py similarity index 100% rename from colossalai/zero/legacy/sharded_model/__init__.py rename to colossalai/legacy/zero/sharded_model/__init__.py diff --git a/colossalai/zero/legacy/sharded_model/_utils.py b/colossalai/legacy/zero/sharded_model/_utils.py similarity index 97% rename from colossalai/zero/legacy/sharded_model/_utils.py rename to colossalai/legacy/zero/sharded_model/_utils.py index f1d642cf3..b8a618ef5 100644 --- a/colossalai/zero/legacy/sharded_model/_utils.py +++ b/colossalai/legacy/zero/sharded_model/_utils.py @@ -3,7 +3,7 @@ from typing import Any, Callable, List, Tuple, Union import torch import torch.nn.functional as F -from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor +from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor def get_gradient_predivide_factor(world_size: int) -> float: diff --git a/colossalai/zero/legacy/sharded_model/reduce_scatter.py b/colossalai/legacy/zero/sharded_model/reduce_scatter.py similarity index 100% rename from colossalai/zero/legacy/sharded_model/reduce_scatter.py rename to colossalai/legacy/zero/sharded_model/reduce_scatter.py diff --git a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py similarity index 97% rename from colossalai/zero/legacy/sharded_model/sharded_model_v2.py rename to 
colossalai/legacy/zero/sharded_model/sharded_model_v2.py index e7064277f..91c21ccf9 100644 --- a/colossalai/zero/legacy/sharded_model/sharded_model_v2.py +++ b/colossalai/legacy/zero/sharded_model/sharded_model_v2.py @@ -11,20 +11,20 @@ import torch.nn as nn from torch.distributed import ProcessGroup from torch.nn.parameter import Parameter -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils.memory import colo_device_memory_capacity +from colossalai.legacy.zero.gemini.ophooks import register_ophooks_recursively +from colossalai.legacy.zero.gemini.paramhooks import BaseParamHookMgr +from colossalai.legacy.zero.gemini.stateful_tensor import TensorState +from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr +from colossalai.legacy.zero.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory +from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_move_to_cpu +from colossalai.legacy.zero.shard_utils import BaseShardStrategy +from colossalai.legacy.zero.sharded_model.reduce_scatter import ReduceScatterBucketer from colossalai.logging import get_dist_logger from colossalai.utils import disposable, get_current_device -from colossalai.utils.memory import colo_device_memory_capacity -from colossalai.zero.gemini.memory_tracer import MemStatsCollector, StaticMemStatsCollector -from colossalai.zero.legacy.gemini.ophooks import register_ophooks_recursively -from colossalai.zero.legacy.gemini.paramhooks import BaseParamHookMgr -from colossalai.zero.legacy.gemini.stateful_tensor import TensorState -from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr -from colossalai.zero.legacy.gemini.tensor_placement_policy import TensorPlacementPolicy, TensorPlacementPolicyFactory -from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_move_to_cpu -from colossalai.zero.legacy.shard_utils import BaseShardStrategy -from colossalai.zero.legacy.sharded_model.reduce_scatter import ReduceScatterBucketer +from colossalai.zero.gemini.memory_tracer import MemStatsCollector from ._utils import ( cast_float_arguments, diff --git a/colossalai/zero/legacy/sharded_model/utils.py b/colossalai/legacy/zero/sharded_model/utils.py similarity index 92% rename from colossalai/zero/legacy/sharded_model/utils.py rename to colossalai/legacy/zero/sharded_model/utils.py index 08806e78e..7a4116699 100644 --- a/colossalai/zero/legacy/sharded_model/utils.py +++ b/colossalai/legacy/zero/sharded_model/utils.py @@ -2,7 +2,7 @@ import copy import torch -from colossalai.zero.legacy.sharded_model import ShardedModelV2 +from colossalai.legacy.zero.sharded_model import ShardedModelV2 def col_model_deepcopy(sharded_model: ShardedModelV2, other_model: torch.nn.Module): diff --git a/colossalai/zero/legacy/sharded_model/zero_hook.py b/colossalai/legacy/zero/sharded_model/zero_hook.py similarity index 94% rename from colossalai/zero/legacy/sharded_model/zero_hook.py rename to colossalai/legacy/zero/sharded_model/zero_hook.py index 1815bee3a..3fc373e5c 100644 --- a/colossalai/zero/legacy/sharded_model/zero_hook.py +++ b/colossalai/legacy/zero/sharded_model/zero_hook.py @@ -4,13 +4,13 @@ import torch import torch.distributed as dist from colossalai.legacy.registry import OPHOOKS +from colossalai.legacy.zero.gemini.ophooks import 
BaseOpHook +from colossalai.legacy.zero.gemini.stateful_tensor import TensorState +from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr +from colossalai.legacy.zero.shard_utils import BaseShardStrategy from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device from colossalai.zero.gemini.memory_tracer import MemStatsCollector -from colossalai.zero.legacy.gemini.ophooks import BaseOpHook -from colossalai.zero.legacy.gemini.stateful_tensor import TensorState -from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr -from colossalai.zero.legacy.shard_utils import BaseShardStrategy @OPHOOKS.register_module diff --git a/colossalai/zero/legacy/sharded_optim/__init__.py b/colossalai/legacy/zero/sharded_optim/__init__.py similarity index 100% rename from colossalai/zero/legacy/sharded_optim/__init__.py rename to colossalai/legacy/zero/sharded_optim/__init__.py diff --git a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py similarity index 97% rename from colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py rename to colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py index 41dd174cb..e21f1cea0 100644 --- a/colossalai/zero/legacy/sharded_optim/sharded_optim_v2.py +++ b/colossalai/legacy/zero/sharded_optim/sharded_optim_v2.py @@ -12,15 +12,15 @@ from torch.nn.parameter import Parameter from torch.optim import Optimizer from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.interface import OptimizerWrapper +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState +from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy +from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage +from colossalai.legacy.zero.sharded_model import ShardedModelV2 +from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32 from colossalai.logging import get_dist_logger -from colossalai.nn.optimizer import ColossalaiOptimizer -from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState -from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy -from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage -from colossalai.zero.legacy.sharded_model import ShardedModelV2 -from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32 class OptimState(Enum): @@ -28,7 +28,7 @@ class OptimState(Enum): UNSCALED = 2 -class ShardedOptimizerV2(ColossalaiOptimizer): +class ShardedOptimizerV2(OptimizerWrapper): """A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO). By default the ZeRO optimizer stage 3 offload Optimizer States on CPU. 
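The hunk above changes the base class of `ShardedOptimizerV2` from the removed `ColossalaiOptimizer` to `colossalai.interface.OptimizerWrapper`. As a minimal sketch of the delegation pattern both base classes rely on — `MinimalOptimizerWrapper` is an illustrative name invented here, not the actual `colossalai.interface.OptimizerWrapper` implementation — the wrapper simply owns an inner `torch.optim.Optimizer` and forwards the standard calls to it:

```python
# Minimal, self-contained sketch (assumption: not the real OptimizerWrapper API).
import torch
from torch import Tensor
from torch.optim import Optimizer


class MinimalOptimizerWrapper:
    """Holds an inner torch.optim.Optimizer and delegates the standard calls,
    so a subclass such as a ZeRO sharded optimizer only overrides what it must."""

    def __init__(self, optim: Optimizer):
        self.optim = optim

    @property
    def param_groups(self):
        return self.optim.param_groups

    def step(self, *args, **kwargs):
        # A ZeRO-style subclass (e.g. ShardedOptimizerV2) would unscale gradients
        # and gather/re-shard parameter state around this delegated update.
        return self.optim.step(*args, **kwargs)

    def zero_grad(self, *args, **kwargs):
        self.optim.zero_grad(*args, **kwargs)

    def backward(self, loss: Tensor):
        loss.backward()

    def state_dict(self):
        return self.optim.state_dict()

    def load_state_dict(self, state_dict):
        self.optim.load_state_dict(state_dict)


if __name__ == "__main__":
    model = torch.nn.Linear(4, 4)
    opt = MinimalOptimizerWrapper(torch.optim.SGD(model.parameters(), lr=0.1))
    loss = model(torch.randn(2, 4)).sum()
    opt.backward(loss)
    opt.step()
    opt.zero_grad()
```

The design point of the swap is that user code keeps calling `step`/`zero_grad`/`backward` on the wrapper, while the subclass decides how sharded state is handled around the inner optimizer; the deleted `colossalai_optimizer.py` later in this diff shows the same delegation structure.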
diff --git a/colossalai/zero/legacy/sharded_param/__init__.py b/colossalai/legacy/zero/sharded_param/__init__.py similarity index 100% rename from colossalai/zero/legacy/sharded_param/__init__.py rename to colossalai/legacy/zero/sharded_param/__init__.py diff --git a/colossalai/zero/legacy/sharded_param/sharded_param.py b/colossalai/legacy/zero/sharded_param/sharded_param.py similarity index 96% rename from colossalai/zero/legacy/sharded_param/sharded_param.py rename to colossalai/legacy/zero/sharded_param/sharded_param.py index 4bcc4b621..454a722cf 100644 --- a/colossalai/zero/legacy/sharded_param/sharded_param.py +++ b/colossalai/legacy/zero/sharded_param/sharded_param.py @@ -2,8 +2,8 @@ from typing import List, Optional, Tuple import torch -from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState -from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage +from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState +from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage from .sharded_tensor import ShardedTensor diff --git a/colossalai/zero/legacy/sharded_param/sharded_tensor.py b/colossalai/legacy/zero/sharded_param/sharded_tensor.py similarity index 94% rename from colossalai/zero/legacy/sharded_param/sharded_tensor.py rename to colossalai/legacy/zero/sharded_param/sharded_tensor.py index af6031260..43c7576b9 100644 --- a/colossalai/zero/legacy/sharded_param/sharded_tensor.py +++ b/colossalai/legacy/zero/sharded_param/sharded_tensor.py @@ -1,6 +1,6 @@ import torch -from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState +from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState class ShardedTensor(StatefulTensor): diff --git a/colossalai/logging/logger.py b/colossalai/logging/logger.py index f9abe4a2a..fd05ddf1d 100644 --- a/colossalai/logging/logger.py +++ b/colossalai/logging/logger.py @@ -134,8 +134,6 @@ class DistributedLogger: Args: message (str): The message to be logged. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): - The parallel mode used for logging. Defaults to ParallelMode.GLOBAL. ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) @@ -147,8 +145,6 @@ class DistributedLogger: Args: message (str): The message to be logged. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): - The parallel mode used for logging. Defaults to ParallelMode.GLOBAL. ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) @@ -160,8 +156,6 @@ class DistributedLogger: Args: message (str): The message to be logged. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): - The parallel mode used for logging. Defaults to ParallelMode.GLOBAL. ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) @@ -173,8 +167,6 @@ class DistributedLogger: Args: message (str): The message to be logged. - parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): - The parallel mode used for logging. Defaults to ParallelMode.GLOBAL. ranks (List[int]): List of parallel ranks. 
""" message_prefix = "{}:{} {}".format(*self.__get_call_info()) diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index edd986ef5..9aeab9f44 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -1,2 +1,2 @@ -from .moe import * +# from .moe import * from .utils import * diff --git a/colossalai/nn/layer/moe/experts.py b/colossalai/nn/layer/moe/experts.py index 56b11f4d9..712d872bb 100644 --- a/colossalai/nn/layer/moe/experts.py +++ b/colossalai/nn/layer/moe/experts.py @@ -6,10 +6,10 @@ import torch import torch.distributed as dist import torch.nn as nn -from colossalai.context import ParallelMode, seed from colossalai.context.moe_context import MOE_CONTEXT +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator from colossalai.utils import get_current_device -from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator class MoeExperts(nn.Module): diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py index 03f55d91f..9293d3208 100644 --- a/colossalai/nn/layer/moe/layers.py +++ b/colossalai/nn/layer/moe/layers.py @@ -6,6 +6,7 @@ import torch.nn as nn import torch.nn.functional as F from colossalai.context.moe_context import MOE_CONTEXT +from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator from colossalai.nn.layer.moe._operation import ( COL_MOE_KERNEL_FLAG, AllGather, @@ -18,7 +19,6 @@ from colossalai.nn.layer.moe.experts import Experts, MoeExperts from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator from colossalai.utils import get_current_device -from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator @no_shard_zero_decrator(is_replicated=True) diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index ee2add48a..7c6fb099d 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -1 +1 @@ -from .loss_moe import MoeCrossEntropyLoss, MoeLoss +# from .loss_moe import MoeCrossEntropyLoss, MoeLoss diff --git a/colossalai/nn/optimizer/__init__.py b/colossalai/nn/optimizer/__init__.py index 06072648b..7e310793f 100644 --- a/colossalai/nn/optimizer/__init__.py +++ b/colossalai/nn/optimizer/__init__.py @@ -1,10 +1,9 @@ -from .colossalai_optimizer import ColossalaiOptimizer +from .cpu_adam import CPUAdam from .fused_adam import FusedAdam from .fused_lamb import FusedLAMB from .fused_sgd import FusedSGD +from .hybrid_adam import HybridAdam from .lamb import Lamb from .lars import Lars -from .cpu_adam import CPUAdam -from .hybrid_adam import HybridAdam -__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] +__all__ = ['FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam'] diff --git a/colossalai/nn/optimizer/colossalai_optimizer.py b/colossalai/nn/optimizer/colossalai_optimizer.py deleted file mode 100644 index 34f5a9541..000000000 --- a/colossalai/nn/optimizer/colossalai_optimizer.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -from torch import Tensor -from torch.optim import Optimizer -from colossalai.utils import clip_grad_norm_fp32 - - -class ColossalaiOptimizer(Optimizer): - - def __init__(self, optim: Optimizer): - self.optim = optim - - @property - def param_groups(self): - return 
self.optim.param_groups - - @property - def defaults(self): - return self.optim.defaults - - def add_param_group(self, *args, **kwargs): - return self.optim.add_param_group(*args, **kwargs) - - def step(self, *args, **kwargs): - return self.optim.step(*args, **kwargs) - - def zero_grad(self, *args, **kwargs): - self.optim.zero_grad(*args, **kwargs) - - def load_state_dict(self, *args, **kwargs): - self.optim.load_state_dict(*args, **kwargs) - - def state_dict(self): - return self.optim.state_dict() - - def backward(self, loss: Tensor): - loss.backward() - - def backward_by_grad(self, tensor: Tensor, grad: Tensor): - torch.autograd.backward(tensors=tensor, grad_tensors=grad) - - def clip_grad_norm(self, model: nn.Module, max_norm: float): - if max_norm > 0.0: - clip_grad_norm_fp32(model.parameters(), max_norm) diff --git a/colossalai/pipeline/__init__.py b/colossalai/pipeline/__init__.py index 0fcde9707..e88a1f00a 100644 --- a/colossalai/pipeline/__init__.py +++ b/colossalai/pipeline/__init__.py @@ -1,4 +1,11 @@ -from .pipelinable import PipelinableContext, PipelinableModel -from .layer_spec import LayerSpec +from .p2p import PipelineP2PCommunication +from .schedule import InterleavedSchedule, OneForwardOneBackwardSchedule, PipelineSchedule +from .stage_manager import PipelineStageManager -__all__ = ['PipelinableModel', 'PipelinableContext', 'LayerSpec'] \ No newline at end of file +__all__ = [ + 'PipelineSchedule', + 'OneForwardOneBackwardSchedule', + 'InterleavedSchedule', + 'PipelineP2PCommunication', + 'PipelineStageManager', +] diff --git a/colossalai/pipeline/middleware/__init__.py b/colossalai/pipeline/middleware/__init__.py deleted file mode 100644 index 79e19f9ea..000000000 --- a/colossalai/pipeline/middleware/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .topo import Topo, Partition, PartitionOutputVal, PartitionInputVal - -__all__ = ['Topo', 'Partition', 'PartitionOutputVal', 'PartitionInputVal'] \ No newline at end of file diff --git a/colossalai/pipeline/rpc/__init__.py b/colossalai/pipeline/rpc/__init__.py deleted file mode 100644 index 9d9e9d44f..000000000 --- a/colossalai/pipeline/rpc/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine, ChimeraPipelineEngine -from .utils import pytree_map - -__all__ = ['FillDrainPipelineEngine', 'OneFOneBPipelineEngine', 'ChimeraPipelineEngine', 'pytree_map'] \ No newline at end of file diff --git a/colossalai/pipeline/schedule/__init__.py b/colossalai/pipeline/schedule/__init__.py index 8b13413b1..07c0f5927 100644 --- a/colossalai/pipeline/schedule/__init__.py +++ b/colossalai/pipeline/schedule/__init__.py @@ -1,7 +1,9 @@ from .base import PipelineSchedule +from .interleaved_pp import InterleavedSchedule from .one_f_one_b import OneForwardOneBackwardSchedule __all__ = [ 'PipelineSchedule', 'OneForwardOneBackwardSchedule', + 'InterleavedSchedule', ] diff --git a/colossalai/tensor/__init__.py b/colossalai/tensor/__init__.py index b2da64e6c..099376d93 100644 --- a/colossalai/tensor/__init__.py +++ b/colossalai/tensor/__init__.py @@ -1,18 +1,11 @@ -from . 
import distspec from .colo_parameter import ColoParameter from .colo_tensor import ColoTensor from .comm_spec import CollectiveCommPattern, CommSpec -from .compute_spec import ComputePattern, ComputeSpec -from .dist_spec_mgr import DistSpecManager -from .distspec import ReplicaSpec, ShardSpec from .param_op_hook import ColoParamOpHook, ColoParamOpHookManager -from .process_group import ProcessGroup -from .tensor_spec import ColoTensorSpec from .utils import convert_dim_partition_dict, convert_parameter, merge_same_dim_mesh_list, named_params_with_colotensor __all__ = [ - 'ColoTensor', 'convert_parameter', 'ComputePattern', 'ComputeSpec', 'named_params_with_colotensor', 'ColoParameter', - 'distspec', 'DistSpecManager', 'ColoParamOpHook', 'ColoParamOpHookManager', 'ProcessGroup', 'ColoTensorSpec', - 'ShardSpec', 'ReplicaSpec', 'CommSpec', 'CollectiveCommPattern', 'convert_dim_partition_dict', + 'ColoTensor', 'convert_parameter', 'named_params_with_colotensor', 'ColoParameter', 'ColoParamOpHook', + 'ColoParamOpHookManager', 'CommSpec', 'CollectiveCommPattern', 'convert_dim_partition_dict', 'merge_same_dim_mesh_list' ] diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 6f9717d35..5226f688b 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,79 +1,32 @@ -from .activation_checkpoint import checkpoint -from .checkpointing import load_checkpoint, save_checkpoint from .common import ( _cast_float, - clip_grad_norm_fp32, conditional_context, - copy_tensor_parallel_attributes, - count_zeros_fp32, disposable, ensure_path_exists, free_storage, is_ddp_ignored, - is_dp_rank_0, - is_model_parallel_parameter, - is_no_pp_or_last_stage, - is_tp_rank_0, - is_using_ddp, - is_using_pp, - is_using_sequence, - multi_tensor_applier, - param_is_not_tensor_parallel_duplicate, - print_rank_0, - switch_virtual_pipeline_parallel_rank, - sync_model_param, -) -from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize -from .data_sampler import DataParallelSampler, get_dataloader -from .memory import ( - colo_device_memory_capacity, - colo_device_memory_used, - colo_get_cpu_memory_capacity, - colo_set_cpu_memory_capacity, - colo_set_process_memory_fraction, - report_memory_usage, + set_seed, ) +from .cuda import empty_cache, get_current_device, set_device, set_to_cuda, synchronize +from .multi_tensor_apply import multi_tensor_applier from .tensor_detector import TensorDetector from .timer import MultiTimer, Timer __all__ = [ - 'checkpoint', - 'print_rank_0', - 'sync_model_param', - 'is_ddp_ignored', - 'is_dp_rank_0', - 'is_tp_rank_0', - 'is_no_pp_or_last_stage', - 'is_using_ddp', - 'is_using_pp', - 'is_using_sequence', 'conditional_context', - 'is_model_parallel_parameter', - 'clip_grad_norm_fp32', - 'count_zeros_fp32', - 'copy_tensor_parallel_attributes', - 'param_is_not_tensor_parallel_duplicate', 'get_current_device', 'synchronize', 'empty_cache', 'set_to_cuda', - 'report_memory_usage', - 'colo_device_memory_capacity', - 'colo_device_memory_used', - 'colo_set_process_memory_fraction', 'Timer', 'MultiTimer', 'multi_tensor_applier', - 'DataParallelSampler', - 'get_dataloader', - 'switch_virtual_pipeline_parallel_rank', 'TensorDetector', - 'load_checkpoint', - 'save_checkpoint', 'ensure_path_exists', 'disposable', - 'colo_set_cpu_memory_capacity', - 'colo_get_cpu_memory_capacity', '_cast_float', 'free_storage', + 'set_seed', + 'is_ddp_ignored', + 'set_device', ] diff --git a/colossalai/utils/checkpoint/__init__.py 
b/colossalai/utils/checkpoint/__init__.py deleted file mode 100644 index 1795b4ce3..000000000 --- a/colossalai/utils/checkpoint/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .module_checkpoint import save_checkpoint, load_checkpoint - -__all__ = ['save_checkpoint', 'load_checkpoint'] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 998901708..8c769c5b1 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -3,44 +3,12 @@ import functools import os import random -import socket -from collections import defaultdict from contextlib import contextmanager from pathlib import Path -from typing import Callable, Dict, List, Optional, Union +from typing import Callable +import numpy as np import torch -import torch.distributed as dist -from torch import inf -from torch.nn.parameter import Parameter - -from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env -from colossalai.tensor import ColoParameter, ProcessGroup - -from .multi_tensor_apply import multi_tensor_applier - -try: - from colossalai._C import fused_optim -except: - fused_optim = None - - -def print_rank_0(msg: str, logger=None): - """Print messages and save logs(optional). This is executed only if you are the rank-0 gpu. - - Args: - msg (str): A string message to output. - logger (:class:`colossalai.logging.DistributedLogger`, optional): - The logger to record the message, defaults to None. - """ - if gpc.get_global_rank() == 0: - if logger is None: - print(msg, flush=True) - else: - logger.info(msg) def ensure_path_exists(filename: str): @@ -50,47 +18,6 @@ def ensure_path_exists(filename: str): Path(dirpath).mkdir(parents=True, exist_ok=True) -def sync_model_param(model, parallel_mode): - r"""Make sure data parameters are consistent during Data Parallel Mode. - - Args: - model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. - parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked. - - Note: - The parallel_mode should be concluded in ``ParallelMode``. 
More details about ``ParallelMode`` could be found - in `parallel_mode `_ - """ - if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1: - for param in model.parameters(): - ranks = gpc.get_ranks_in_group(parallel_mode) - dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode)) - - -def is_dp_rank_0(): - return not gpc.is_initialized(ParallelMode.DATA) or gpc.is_first_rank(ParallelMode.DATA) - - -def is_tp_rank_0(): - return not gpc.is_initialized(ParallelMode.TENSOR) or gpc.is_first_rank(ParallelMode.TENSOR) - - -def is_no_pp_or_last_stage(): - return not gpc.is_initialized(ParallelMode.PIPELINE) or gpc.is_last_rank(ParallelMode.PIPELINE) - - -def is_using_ddp(): - return gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1 - - -def is_using_pp(): - return gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1 - - -def is_using_sequence(): - return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1 - - @contextmanager def conditional_context(context_manager, enable=True): if enable: @@ -100,365 +27,10 @@ def conditional_context(context_manager, enable=True): yield -class model_branch_context(object): - - def __enter__(self): - self.env_status = env.save() - - def __exit__(self, *exc_info): - env.load(**self.env_status) - - -def is_model_parallel_parameter(p): - return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) - - def is_ddp_ignored(p): return getattr(p, '_ddp_to_ignore', False) -def _calc_l2_norm(grads): - # we should not - global fused_optim - - if fused_optim is None: - from colossalai.kernel.op_builder import FusedOptimBuilder - fused_optim = FusedOptimBuilder().load() - - norm = 0.0 - if len(grads) > 0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - norm, _ = multi_tensor_applier( - fused_optim.multi_tensor_l2norm, - dummy_overflow_buf, - [grads], - False # no per-parameter norm - ) - return norm - - -def _calc_lp(grads, norm_type): - norm = 0.0 - for grad in grads: - grad_norm = torch.norm(grad, norm_type) - norm += grad_norm**norm_type - return norm - - -def _move_norm_to_cuda(norm: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]: - if torch.is_tensor(norm) and norm.device.type != 'cuda': - norm = norm.to(torch.cuda.current_device()) - return norm - - -def _get_tensor_norm(norm: Union[float, torch.Tensor], move_to_cuda) -> torch.Tensor: - if isinstance(norm, float): - norm = torch.Tensor([norm]) - if move_to_cuda: - norm = norm.to(torch.cuda.current_device()) - return norm - - -# ======== Gradient Clipping ========= - - -def _compute_local_lp(params: List[ColoParameter], norm_type: float) -> float: - if len(params) == 0: - return 0.0 - grads = [p.grad for p in params] - use_cuda_kernel = grads[0].device.type == 'cuda' - if norm_type == inf: - local_lp = max([g.abs().max() for g in grads]) - elif norm_type == 2.0 and use_cuda_kernel: - local_lp = _calc_l2_norm(grads)**norm_type - else: - local_lp = _calc_lp(grads, norm_type) - if isinstance(local_lp, torch.Tensor): - return local_lp.item() - return local_lp - - -def _compute_buckets_lp(params: List[ColoParameter], norm_type: float) -> float: - if len(params) == 0: - return 0.0 - buckets: Dict[Optional[ProcessGroup], List[ColoParameter]] = defaultdict(list) - for p in params: - if p.is_replicate(): - buckets[None].append(p) - else: - buckets[p.get_process_group().tp_process_group()].append(p) - total_lp = 0.0 - for group, bucket in buckets.items(): - 
local_lp = _compute_local_lp(bucket, norm_type) - if group is not None: - local_lp_tensor = torch.tensor([local_lp], device=torch.cuda.current_device()) - if norm_type == inf: - dist.all_reduce(local_lp_tensor, op=dist.ReduceOp.MAX, group=group) - else: - dist.all_reduce(local_lp_tensor, group=group) - local_lp = local_lp_tensor.item() - if norm_type == inf: - total_lp = max(total_lp, local_lp) - else: - total_lp += local_lp - return total_lp - - -def _compute_pp_grad_lp(total_lp: float, norm_type: float) -> float: - if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - total_lp_tensor = torch.tensor([total_lp], device=torch.cuda.current_device()) - if norm_type == inf: - dist.all_reduce(total_lp_tensor, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.PIPELINE)) - else: - dist.all_reduce(total_lp_tensor, group=gpc.get_group(ParallelMode.PIPELINE)) - total_lp = total_lp_tensor.item() - return total_lp - - -def _compute_grad_lp(parameters, norm_type: float = 2.0) -> float: - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - grad_dtype = None - cpu_grad_params: List[ColoParameter] = [] - cuda_grad_params: List[ColoParameter] = [] - for p in parameters: - if p.grad is None: - continue - assert isinstance(p, ColoParameter) - if grad_dtype is None: - grad_dtype = p.grad.dtype - assert p.grad.dtype == grad_dtype, f'Expected all grads are {grad_dtype}, got {p.grad.dtype}' - if p.grad.device.type == 'cuda': - cuda_grad_params.append(p) - else: - cpu_grad_params.append(p) - norm_type = float(norm_type) - cpu_lp = _compute_buckets_lp(cpu_grad_params, norm_type) - cuda_lp = _compute_buckets_lp(cuda_grad_params, norm_type) - if norm_type == inf: - total_lp = max(cpu_lp, cuda_lp) - else: - total_lp = cpu_lp + cuda_lp - return _compute_pp_grad_lp(total_lp, norm_type) - - -def compute_grad_norm(parameters, norm_type: float = 2.0) -> float: - norm_type = float(norm_type) - total_norm = _compute_grad_lp(parameters, norm_type) - if norm_type != inf: - total_norm = total_norm**(1 / norm_type) - return total_norm - - -def _clip_grad_norm(parameters, max_norm: float, total_norm: float) -> None: - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1.0: - cuda_grads: List[torch.Tensor] = [] - cpu_grads: List[torch.Tensor] = [] - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - for p in parameters: - if p.grad is None: - continue - if p.grad.device.type == 'cuda': - cuda_grads.append(p.grad.detach()) - else: - cpu_grads.append(p.grad.detach()) - if len(cuda_grads) > 0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [cuda_grads, cuda_grads], - clip_coef) - for g in cpu_grads: - g.mul_(clip_coef) - - -def clip_grad_norm(parameters, max_norm: float, norm_type: float = 2.0) -> float: - total_norm = compute_grad_norm(parameters, norm_type) - _clip_grad_norm(parameters, max_norm, total_norm) - return total_norm - - -def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters whose gradients are in fp32. - - This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and - added functionality to handle model parallel parameters. - - Note: - the gradients are modified in place. - - Args: - parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`): - An iterable of Tensors or a single Tensor that will have gradients normalized. 
- max_norm (Union[float, int]): Max norm of the gradients. - norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm. - - Returns: - float: Total norm of the parameters. - """ - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - params: List[Parameter] = [] - has_zero_shared_param: bool = False - for param in parameters: - if param.grad is not None: - # Make sure the grads are in fp32 - assert param.grad.dtype == torch.float, \ - f'expected gradient to be dtype torch.float, but got {param.grad.type()}' - if hasattr(param, 'colo_attr') and param.colo_attr.sharded_data_tensor.is_sharded: - has_zero_shared_param = True - params.append(param) - - if len(params) == 0: - enable_cuda_kernels = False - else: - enable_cuda_kernels = params[0].grad.device.type == 'cuda' - # Norm parameters. - max_norm = float(max_norm) - norm_type = float(norm_type) - - # Parameters can be on CPU or CUDA - # If parameters are on CPU, disable CUDA kernels - - # Calculate norm. - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in params) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=gpc.get_group(ParallelMode.MODEL), - async_op=False) - if has_zero_shared_param: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=gpc.get_group(ParallelMode.DATA), - async_op=False) - total_norm = total_norm_cuda[0].item() - else: - tensor_parallel_grads = [] - no_tensor_parallel_grads = [] - zero_sharded_grads = [] - for p in params: - if is_model_parallel_parameter(p): - reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS))**(1 / norm_type) - tensor_parallel_grads.append(p.grad.data / reductor) - elif hasattr(p, 'colo_attr') and p.colo_attr.sharded_data_tensor.is_sharded: - zero_sharded_grads.append(p.grad.data) - else: - no_tensor_parallel_grads.append(p.grad.data) - - if norm_type == 2.0 and enable_cuda_kernels: - tensor_parallel_norm = _calc_l2_norm(tensor_parallel_grads)**norm_type - no_tensor_parallel_norm = _calc_l2_norm(no_tensor_parallel_grads)**norm_type - zero_sharded_norm = _calc_l2_norm(zero_sharded_grads)**norm_type - else: - tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type) - no_tensor_parallel_norm = _calc_lp(no_tensor_parallel_grads, norm_type) - zero_sharded_norm = _calc_lp(zero_sharded_grads, norm_type) - # If norm is type of float, then we convert them into torch.Tensor. - tensor_parallel_norm = _get_tensor_norm(tensor_parallel_norm, enable_cuda_kernels) - no_tensor_parallel_norm = _get_tensor_norm(no_tensor_parallel_norm, enable_cuda_kernels) - zero_sharded_norm = _get_tensor_norm(zero_sharded_norm, enable_cuda_kernels) - # If grads are on CPU, the norms is also on CPU. Cast them to CUDA tensors - if not enable_cuda_kernels: - tensor_parallel_norm = _move_norm_to_cuda(tensor_parallel_norm) - no_tensor_parallel_norm = _move_norm_to_cuda(no_tensor_parallel_norm) - zero_sharded_norm = _move_norm_to_cuda(zero_sharded_norm) - - # Sum across all model-parallel GPUs. 
- if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: - dist.all_reduce(tensor_parallel_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR)) - # Sum across all zero sharded GPUs - if len(zero_sharded_grads) > 0: - dist.all_reduce(zero_sharded_norm, group=gpc.get_group(ParallelMode.DATA)) - no_tensor_parallel_norm += zero_sharded_norm - total_norm = tensor_parallel_norm + no_tensor_parallel_norm - if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE)) - total_norm = total_norm**(1.0 / norm_type) - if torch.is_tensor(total_norm): - total_norm = total_norm.item() - - # Scale. - clip_coeff = max_norm / (total_norm + 1.0e-6) - if clip_coeff < 1.0: - if enable_cuda_kernels: - grads = [p.grad.detach() for p in params] - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff) - else: - for p in params: - p.grad.detach().mul_(clip_coeff) - return total_norm - - -def count_zeros_fp32(parameters): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - total_num_zeros = 0.0 - for param in parameters: - grad_not_none = param.grad is not None - is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) - if grad_not_none and is_not_tp_duplicate: - grad = param.grad.detach() - num_zeros = grad.numel() - torch.count_nonzero(grad) - total_num_zeros = num_zeros + total_num_zeros - - total_num_zeros = torch.IntTensor([int(total_num_zeros)]).cuda() - - # Sum across all model-parallel GPUs. 
- ops = [] - ops.append( - dist.all_reduce(total_num_zeros, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR), async_op=True)) - if gpc.is_initialized(ParallelMode.PIPELINE): - ops.append( - dist.all_reduce(total_num_zeros, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.PIPELINE), - async_op=True)) - - for req in ops: - req.wait() - total_num_zeros = total_num_zeros.item() - - return total_num_zeros - - -def copy_tensor_parallel_attributes(src_tensor, dst_tensor): - for attr in TENSOR_PARALLEL_ATTRIBUTES: - if hasattr(src_tensor, attr): - val = getattr(src_tensor, attr) - setattr(dst_tensor, attr, val) - - -def param_is_not_tensor_parallel_duplicate(param): - return (hasattr(param, IS_TENSOR_PARALLEL) and getattr(param, IS_TENSOR_PARALLEL)) or (gpc.get_local_rank( - ParallelMode.TENSOR) == 0) - - -@contextmanager -def switch_virtual_pipeline_parallel_rank(rank): - prev_rank = gpc.virtual_pipeline_parallel_rank - try: - gpc.set_virtual_pipeline_parallel_rank(rank) - yield - finally: - gpc.set_virtual_pipeline_parallel_rank(prev_rank) - - def disposable(func: Callable) -> Callable: executed = False @@ -489,3 +61,9 @@ def _cast_float(args, dtype: torch.dtype): elif isinstance(args, dict): args = {k: _cast_float(v, dtype) for k, v in args.items()} return args + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) diff --git a/colossalai/utils/cuda.py b/colossalai/utils/cuda.py index 60f3ccb60..6b5d17cf0 100644 --- a/colossalai/utils/cuda.py +++ b/colossalai/utils/cuda.py @@ -1,7 +1,10 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from typing import Optional + import torch +import torch.distributed as dist def set_to_cuda(models): @@ -23,7 +26,7 @@ def set_to_cuda(models): def get_current_device() -> torch.device: """ Returns currently selected device (gpu/cpu). - If cuda available, return gpu, otherwise return cpu. + If cuda available, return gpu, otherwise return cpu. """ if torch.cuda.is_available(): return torch.device(f'cuda:{torch.cuda.current_device()}') @@ -45,3 +48,9 @@ def empty_cache(): """ if torch.cuda.is_available(): torch.cuda.empty_cache() + + +def set_device(index: Optional[int] = None) -> None: + if index is None: + index = dist.get_rank() % torch.cuda.device_count() + torch.cuda.set_device(index) diff --git a/colossalai/utils/moe.py b/colossalai/utils/moe.py index 86d04c119..6456dfb90 100644 --- a/colossalai/utils/moe.py +++ b/colossalai/utils/moe.py @@ -1,52 +1,54 @@ -import torch.nn as nn -import torch.distributed as dist -from colossalai.core import global_context as gpc -from colossalai.context.moe_context import MOE_CONTEXT -from colossalai.context import ParallelMode -from .common import is_using_ddp -from typing import Dict, List - - -def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]: - """Returns a parameter dictionary, the key of which is the expert parallel - size of every parameter. Since the parameters in data parallelism is replicated - in each GPU, we set their ep_size to 1. - - Args: - model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict. 
- """ - epsize_param_dict = dict() - for param in model.parameters(): - if not hasattr(param, 'moe_info'): - ep_size = 1 # set ep_size to 1 for dp parameters - else: - ep_size = param.moe_info.ep_size - if ep_size not in epsize_param_dict: - epsize_param_dict[ep_size] = [] - epsize_param_dict[ep_size].append(param) - - return epsize_param_dict - - -def sync_moe_model_param(model: nn.Module): - """Make sure model parameters are consistent in MoE parallel context. - - Args: - model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. - """ - if is_using_ddp(): - - param_dict = get_moe_epsize_param_dict(model) - - # synchronize the parameters whose dp_group is the whole world - if 1 in param_dict: - src_rank = gpc.get_ranks_in_group(ParallelMode.DATA)[0] - for param in param_dict[1]: - dist.broadcast(param, src=src_rank, group=gpc.get_group(ParallelMode.DATA)) - - for ep_size in param_dict: - # When ep_size = world_size, communication is not needed - if ep_size != 1 and ep_size != MOE_CONTEXT.world_size: - src_rank = dist.get_rank(MOE_CONTEXT.parallel_info_dict[ep_size].ep_group) - for param in param_dict[ep_size]: - dist.broadcast(param, src=src_rank, group=param.moe_info.dp_group) +from typing import Dict, List + +import torch.distributed as dist +import torch.nn as nn + +from colossalai.context.moe_context import MOE_CONTEXT +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import is_using_ddp + + +def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]: + """Returns a parameter dictionary, the key of which is the expert parallel + size of every parameter. Since the parameters in data parallelism is replicated + in each GPU, we set their ep_size to 1. + + Args: + model (:class:`torch.nn.Module`): A pyTorch `nn.Module` from which we get dict. + """ + epsize_param_dict = dict() + for param in model.parameters(): + if not hasattr(param, 'moe_info'): + ep_size = 1 # set ep_size to 1 for dp parameters + else: + ep_size = param.moe_info.ep_size + if ep_size not in epsize_param_dict: + epsize_param_dict[ep_size] = [] + epsize_param_dict[ep_size].append(param) + + return epsize_param_dict + + +def sync_moe_model_param(model: nn.Module): + """Make sure model parameters are consistent in MoE parallel context. + + Args: + model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency. 
+ """ + if is_using_ddp(): + + param_dict = get_moe_epsize_param_dict(model) + + # synchronize the parameters whose dp_group is the whole world + if 1 in param_dict: + src_rank = gpc.get_ranks_in_group(ParallelMode.DATA)[0] + for param in param_dict[1]: + dist.broadcast(param, src=src_rank, group=gpc.get_group(ParallelMode.DATA)) + + for ep_size in param_dict: + # When ep_size = world_size, communication is not needed + if ep_size != 1 and ep_size != MOE_CONTEXT.world_size: + src_rank = dist.get_rank(MOE_CONTEXT.parallel_info_dict[ep_size].ep_group) + for param in param_dict[ep_size]: + dist.broadcast(param, src=src_rank, group=param.moe_info.dp_group) diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index dad852a34..549635af4 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -3,7 +3,8 @@ from typing import Any, Dict, Iterator, Optional, Tuple, Union import torch from torch import nn -from colossalai.tensor import ColoParameter, ColoTensor, ProcessGroup +from colossalai.legacy.tensor import ProcessGroup +from colossalai.tensor import ColoParameter, ColoTensor from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses # find named_params includes replica diff --git a/colossalai/zero/gemini/memory_tracer/__init__.py b/colossalai/zero/gemini/memory_tracer/__init__.py index 02c9d5754..e1fe904eb 100644 --- a/colossalai/zero/gemini/memory_tracer/__init__.py +++ b/colossalai/zero/gemini/memory_tracer/__init__.py @@ -3,9 +3,8 @@ from .memory_stats import MemStats # isort:skip from .memory_monitor import AsyncMemoryMonitor, SyncCudaMemoryMonitor # isort:skip from .memstats_collector import MemStatsCollector # isort:skip from .chunk_memstats_collector import ChunkMemStatsCollector # isort:skip -from .static_memstats_collector import StaticMemStatsCollector # isort:skip __all__ = [ - 'AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector', 'ChunkMemStatsCollector', - 'StaticMemStatsCollector', 'MemStats', 'OrderedParamGenerator' + 'AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector', 'ChunkMemStatsCollector', 'MemStats', + 'OrderedParamGenerator' ] diff --git a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py index 83903bbf4..b93ad2c44 100644 --- a/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py +++ b/colossalai/zero/gemini/memory_tracer/chunk_memstats_collector.py @@ -1,7 +1,6 @@ from typing import Optional from colossalai.utils import get_current_device -from colossalai.utils.memory import colo_device_memory_capacity from colossalai.zero.gemini.chunk import ChunkManager from .memory_stats import MemStats @@ -33,4 +32,5 @@ class ChunkMemStatsCollector(MemStatsCollector): @property def cuda_margin_mem(self) -> float: + from colossalai.legacy.utils.memory import colo_device_memory_capacity return colo_device_memory_capacity(get_current_device()) - self._memstats.max_overall_cuda diff --git a/colossalai/zero/gemini/memory_tracer/memory_monitor.py b/colossalai/zero/gemini/memory_tracer/memory_monitor.py index 4bb585677..2a65d4b55 100644 --- a/colossalai/zero/gemini/memory_tracer/memory_monitor.py +++ b/colossalai/zero/gemini/memory_tracer/memory_monitor.py @@ -5,7 +5,7 @@ from time import sleep, time import torch -from colossalai.utils import colo_device_memory_used, get_current_device +from colossalai.utils import get_current_device class 
MemoryMonitor: @@ -110,6 +110,7 @@ class AsyncMemoryMonitor(MemoryMonitor): return max_usage def _measure_usage(self): + from colossalai.legacy.utils import colo_device_memory_used max_usage = 0 while self.keep_measuring: max_usage = max( diff --git a/colossalai/zero/gemini/memory_tracer/memstats_collector.py b/colossalai/zero/gemini/memory_tracer/memstats_collector.py index 0694be485..abb3dcc74 100644 --- a/colossalai/zero/gemini/memory_tracer/memstats_collector.py +++ b/colossalai/zero/gemini/memory_tracer/memstats_collector.py @@ -70,7 +70,7 @@ class MemStatsCollector: Sampling model data statistics. """ if self._start_flag and not self.use_outside_memstats: - from colossalai.zero.legacy.gemini import StatefulTensor + from colossalai.legacy.zero.gemini import StatefulTensor # The following code work for ZeroInitContext, which is deprecated in v0.1.12 cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda'] diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py index e5466965c..6656821fe 100644 --- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py +++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py @@ -1,12 +1,12 @@ import torch.nn -from colossalai.tensor.param_op_hook import ColoParamOpHookManager -from colossalai.utils import _cast_float -from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import ( +from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import ( GradMemStats, GradMemTracerHook, ParamMemTracerHook, ) +from colossalai.tensor.param_op_hook import ColoParamOpHookManager +from colossalai.utils import _cast_float from .memory_stats import MemStats diff --git a/colossalai/zero/gemini/placement_policy.py b/colossalai/zero/gemini/placement_policy.py index cd775da5e..a35529723 100644 --- a/colossalai/zero/gemini/placement_policy.py +++ b/colossalai/zero/gemini/placement_policy.py @@ -6,8 +6,8 @@ from typing import Dict, List, Optional, Tuple, Type import torch +from colossalai.legacy.utils.memory import colo_device_memory_capacity from colossalai.utils import get_current_device -from colossalai.utils.memory import colo_device_memory_capacity from colossalai.zero.gemini.chunk import Chunk from .chunk import Chunk, ChunkManager diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py index 4205a9891..ece92fe02 100644 --- a/colossalai/zero/low_level/_utils.py +++ b/colossalai/zero/low_level/_utils.py @@ -7,9 +7,6 @@ from torch import Tensor, inf from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed import ProcessGroup -from colossalai.tensor import ColoParameter -from colossalai.utils import is_model_parallel_parameter - def flatten(input_): return _flatten_dense_tensors(input_) diff --git a/docs/README.md b/docs/README.md index f0cb50ffe..a5ae2ce96 100644 --- a/docs/README.md +++ b/docs/README.md @@ -108,5 +108,5 @@ We support `autodoc` to extract the docstring and transform it into a Web elemen You just need to add `{{ autodoc: }}` in your markdown as a single line. An example is given below and you can see the outcome in [this PR](https://github.com/hpcaitech/ColossalAI-Documentation/pull/175). 
```markdown -{{ autodoc:colossalai.amp.apex_amp.convert_to_apex_amp }} +{{ autodoc:colossalai.legacy.amp.apex_amp.convert_to_apex_amp }} ``` diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md index 384221596..63434a526 100644 --- a/docs/source/en/advanced_tutorials/add_your_parallel.md +++ b/docs/source/en/advanced_tutorials/add_your_parallel.md @@ -31,7 +31,7 @@ global context for users to easily manage their process groups. If you wish to a define a new class and set it in your configuration file. To define your own way of creating process groups, you can follow the steps below to create a new distributed initialization. -1. Add your parallel mode in `colossalai.context.parallel_mode.ParallelMode`. +1. Add your parallel mode in `colossalai.legacy.context.parallel_mode.ParallelMode`. ```python class ParallelMode(Enum): GLOBAL = 'global' diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 36c94fb49..0218264cc 100644 --- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -37,7 +37,7 @@ import torch.nn as nn from colossalai import nn as col_nn from colossalai.amp import AMP_TYPE from colossalai.legacy.builder.pipeline import partition_uniform -from colossalai.context.parallel_mode import ParallelMode +from colossalai.legacy.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) diff --git a/docs/source/en/basics/command_line_tool.md b/docs/source/en/basics/command_line_tool.md index 48b199cf7..4c278aaa0 100644 --- a/docs/source/en/basics/command_line_tool.md +++ b/docs/source/en/basics/command_line_tool.md @@ -30,24 +30,4 @@ This command will inform you information regarding the version compatibility and To launch distributed jobs on single or multiple nodes, the command `colossalai run` can be used for process launching. You may refer to [Launch Colossal-AI](./launch_colossalai.md) for more details. -## Tensor Parallel Micro-Benchmarking - -As Colossal-AI provides an array of tensor parallelism methods, it is not intuitive to choose one for your hardware and -model. Therefore, we provide a simple benchmarking to evaluate the performance of various tensor parallelisms on your system. -This benchmarking is run on a simple MLP model where the input data is of the shape `(batch_size, seq_length, hidden_size)`. -Based on the number of GPUs, the CLI will look for all possible tensor parallel configurations and display the benchmarking results. -You can customize the benchmarking configurations by checking out `colossalai benchmark --help`. - -```shell -# run on 4 GPUs -colossalai benchmark --gpus 4 - -# run on 8 GPUs -colossalai benchmark --gpus 8 -``` - -:::caution - -Only single-node benchmarking is supported currently. 
- -::: + diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md index c4b0f6557..812b9c34e 100644 --- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md +++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md @@ -24,7 +24,7 @@ 并行通常由进程组来管理,参与相同并行算法的进程被置于同一进程组。对于不同的并行算法,需要创建不同的进程组。 Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管理进程组。如果你想添加新的进程组,你可以很容易地定义一个新的类并在你的配置文件中设置它。为了定义你自己的进程组创建方式,你可以按照下面的步骤来创建一个新的分布式初始化。 -1. 在 `colossalai.context.parallel_mode.ParallelMode` 中添加你自己的并行模式。 +1. 在 `colossalai.legacy.context.parallel_mode.ParallelMode` 中添加你自己的并行模式。 ```python class ParallelMode(Enum): GLOBAL = 'global' diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 3f57f39f2..a1d58e9fd 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -37,7 +37,7 @@ import torch.nn as nn from colossalai import nn as col_nn from colossalai.amp import AMP_TYPE from colossalai.legacy.builder.pipeline import partition_uniform -from colossalai.context.parallel_mode import ParallelMode +from colossalai.legacy.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) diff --git a/docs/source/zh-Hans/basics/command_line_tool.md b/docs/source/zh-Hans/basics/command_line_tool.md index 9b0275a6c..5c4c18989 100644 --- a/docs/source/zh-Hans/basics/command_line_tool.md +++ b/docs/source/zh-Hans/basics/command_line_tool.md @@ -26,22 +26,4 @@ Colossal-AI给用户提供了命令行工具,目前命令行工具可以用来 在分布式训练时,我们可以使用`colossalai run`来启动单节点或者多节点的多进程,详细的内容可以参考[启动 Colossal-AI](./launch_colossalai.md)。 -## 张量并行基准测试 - -Colossal-AI提供了多种张量并行,想要充分理解这些方法需要一定的学习成本,对于新手来说很难靠经验选择一个并行方式。 -所以我们提供了一个简单的基准测试,能够让用户在自己的机器上测试不同张量并行的性能。这个基准测试跑一个并行的MLP模型, -输入数据的维度为`(批大小,序列长度,隐藏层维度)`。通过指定GPU的数量,Colossal-AI会搜索所有可行的并行配置。用户可以通过查看`colossalai benchmark --help`来自定义相关的测试参数。 - -```shell -# 使用4个GPU -colossalai benchmark --gpus 4 - -# 使用8个GPU -colossalai benchmark --gpus 8 -``` - -:::caution - -目前仅支持单节点的基准测试。 - -::: + diff --git a/examples/community/roberta/pretraining/pretrain_utils.py b/examples/community/roberta/pretraining/pretrain_utils.py index cea6ac2c3..e6a393a57 100644 --- a/examples/community/roberta/pretraining/pretrain_utils.py +++ b/examples/community/roberta/pretraining/pretrain_utils.py @@ -16,7 +16,7 @@ from transformers import ( get_linear_schedule_with_warmup, ) -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc from colossalai.nn.lr_scheduler import LinearWarmupLR from colossalai.nn.optimizer import FusedAdam, HybridAdam diff --git a/examples/community/roberta/pretraining/run_pretraining.py b/examples/community/roberta/pretraining/run_pretraining.py index 53fa9f489..fa6457cab 100644 --- a/examples/community/roberta/pretraining/run_pretraining.py +++ b/examples/community/roberta/pretraining/run_pretraining.py @@ -17,7 +17,7 @@ from utils.logger import Logger import colossalai from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc from colossalai.nn.parallel import GeminiDDP, zero_model_wrapper, zero_optim_wrapper from colossalai.tensor import ColoParameter, 
ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.utils import get_current_device diff --git a/examples/community/roberta/pretraining/utils/exp_util.py b/examples/community/roberta/pretraining/utils/exp_util.py index 4a2c9d8a4..1fcaa428b 100644 --- a/examples/community/roberta/pretraining/utils/exp_util.py +++ b/examples/community/roberta/pretraining/utils/exp_util.py @@ -5,7 +5,7 @@ import shutil import psutil import torch -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc def logging(s, log_path, print_=True, log_=True): diff --git a/examples/images/dreambooth/test_ci.sh b/examples/images/dreambooth/test_ci.sh index 84345f589..b0a96ec70 100644 --- a/examples/images/dreambooth/test_ci.sh +++ b/examples/images/dreambooth/test_ci.sh @@ -1,24 +1,26 @@ #!/bin/bash set -xe -pip install -r requirements.txt +echo "this test is slow" -HF_DATASETS_OFFLINE=1 -TRANSFORMERS_OFFLINE=1 -DIFFUSERS_OFFLINE=1 +# pip install -r requirements.txt -# "torch_ddp" "torch_ddp_fp16" "low_level_zero" -for plugin in "gemini"; do - torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ - --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ - --instance_data_dir="/data/dreambooth/Teyvat/data" \ - --output_dir="./weight_output" \ - --instance_prompt="a picture of a dog" \ - --resolution=512 \ - --plugin=$plugin \ - --train_batch_size=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --test_run=True \ - --num_class_images=200 -done +# HF_DATASETS_OFFLINE=1 +# TRANSFORMERS_OFFLINE=1 +# DIFFUSERS_OFFLINE=1 + +# # "torch_ddp" "torch_ddp_fp16" "low_level_zero" +# for plugin in "gemini"; do +# torchrun --nproc_per_node 4 --standalone train_dreambooth_colossalai.py \ +# --pretrained_model_name_or_path="/data/dreambooth/diffuser/stable-diffusion-v1-4" \ +# --instance_data_dir="/data/dreambooth/Teyvat/data" \ +# --output_dir="./weight_output" \ +# --instance_prompt="a picture of a dog" \ +# --resolution=512 \ +# --plugin=$plugin \ +# --train_batch_size=1 \ +# --learning_rate=5e-6 \ +# --lr_scheduler="constant" \ +# --lr_warmup_steps=0 \ +# --test_run=True \ +# --num_class_images=200 +# done diff --git a/examples/images/dreambooth/train_dreambooth_colossalai.py b/examples/images/dreambooth/train_dreambooth_colossalai.py index f60704650..9b2ed3b97 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Optional import torch +import torch.distributed as dist import torch.nn.functional as F import torch.utils.checkpoint from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel @@ -21,13 +22,9 @@ from transformers import AutoTokenizer, PretrainedConfig import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device -from colossalai.zero import ColoInitContext -from colossalai.zero.gemini import get_static_torch_model disable_existing_loggers() logger = get_dist_logger() @@ -366,8 +363,8 @@ def main(args): else:
colossalai.launch_from_torch(config={}, seed=args.seed) - local_rank = gpc.get_local_rank(ParallelMode.DATA) - world_size = gpc.get_world_size(ParallelMode.DATA) + local_rank = dist.get_rank() + world_size = dist.get_world_size() if args.with_prior_preservation: class_images_dir = Path(args.class_data_dir) diff --git a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py index c98950fd7..654bce36c 100644 --- a/examples/images/dreambooth/train_dreambooth_colossalai_lora.py +++ b/examples/images/dreambooth/train_dreambooth_colossalai_lora.py @@ -23,8 +23,8 @@ from transformers import AutoTokenizer, PretrainedConfig import colossalai from colossalai.booster import Booster from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam from colossalai.utils import get_current_device diff --git a/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py b/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py index e331fc8fc..84b02633e 100644 --- a/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py +++ b/examples/language/gpt/experiments/auto_parallel/auto_parallel_with_gpt.py @@ -7,8 +7,8 @@ import transformers from gpt_modules import GPT2LMHeadModel, GPTLMLoss from colossalai.auto_parallel.tensor_shard.initialize import autoparallelize -from colossalai.core import global_context as gpc from colossalai.initialize import launch_from_torch +from colossalai.legacy.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger BATCH_SIZE = 16 diff --git a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py index ad69888b8..30d6aab4f 100644 --- a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py +++ b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py @@ -3,7 +3,6 @@ import time from functools import partial import torch -from model_zoo import model_builder from torch import nn from tqdm import tqdm @@ -14,11 +13,12 @@ from colossalai.fx.passes.adding_split_node_pass import ( split_with_split_nodes_pass, ) from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.pipeline.middleware.adaptor import get_fx_topology +from colossalai.legacy.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine +from colossalai.legacy.pipeline.rpc.utils import rpc_run from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam -from colossalai.pipeline.middleware.adaptor import get_fx_topology -from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine -from colossalai.pipeline.rpc.utils import rpc_run +from model_zoo import model_builder def parse_args(): diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh index 57ce6ab64..5eaa4af4d 100644 --- a/examples/language/gpt/gemini/run_gemini.sh +++ b/examples/language/gpt/gemini/run_gemini.sh @@ -9,11 
+9,6 @@ export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} export TRAIN_STEP=${TRAIN_STEP:-10} # export PYTHONPATH=$PWD:$PYTHONPATH -if [ ${USE_SHARD_INIT} = "True" ]; then - USE_SHARD_INIT="--shardinit" -else - USE_SHARD_INIT="" -fi mkdir -p gemini_logs @@ -22,4 +17,4 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \ --batch_size=${BATCH_SIZE} \ --distplan=${DISTPLAN} \ --train_step=${TRAIN_STEP} \ -2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log +2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}.log diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py index 347251ca5..f9d30fd15 100644 --- a/examples/language/gpt/gemini/train_gpt_demo.py +++ b/examples/language/gpt/gemini/train_gpt_demo.py @@ -1,3 +1,4 @@ +import argparse import os from contextlib import nullcontext from functools import partial @@ -9,7 +10,6 @@ import torch.nn as nn from commons.model_zoo import model_builder from commons.utils import get_data, get_profile_context, get_tflops, get_time_stamp from packaging import version -from torch.nn.parallel import DistributedDataParallel as DDP import colossalai from colossalai.booster import Booster @@ -23,7 +23,7 @@ CAI_VERSION = colossalai.__version__ def parse_args(): - parser = colossalai.get_default_parser() + parser = argparse.ArgumentParser() parser.add_argument( "--distplan", type=str, diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index b9e4e43a8..db742220d 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -2,4 +2,4 @@ set -x pip install -r requirements.txt cd gemini && bash test_ci.sh -cd ../hybridparallelism && bash run.sh +# cd ../hybridparallelism && bash run.sh diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py index e521193a9..a6c80394c 100644 --- a/examples/language/gpt/titans/model/embed.py +++ b/examples/language/gpt/titans/model/embed.py @@ -6,8 +6,8 @@ from torch import nn as nn from torch.nn import functional as F from torch.nn.parameter import Parameter -from colossalai.context import ParallelMode, seed -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode, seed +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py index 72297c540..746acbf7d 100644 --- a/examples/language/gpt/titans/model/gpt1d.py +++ b/examples/language/gpt/titans/model/gpt1d.py @@ -9,13 +9,13 @@ from torch import nn as nn from colossalai import kernel from colossalai import nn as col_nn -from colossalai.core import global_context as gpc from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.nn.layer.utils import ACT2FN, divide +from colossalai.legacy.utils.activation_checkpoint import checkpoint from colossalai.utils import checkpoint -from colossalai.utils.activation_checkpoint 
import checkpoint __all__ = [ 'GPTMLP1D', 'GPTSelfAttention1D', 'GPTTransformerLayer1D', 'FusedGPTSelfAttention1D', 'FusedGPTTransformerLayer1D' diff --git a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py index 9b22d156b..a9da246fa 100644 --- a/examples/language/gpt/titans/model/pipeline_gpt1d.py +++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py @@ -7,11 +7,11 @@ import torch.nn as nn from colossalai import kernel from colossalai import nn as col_nn -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.pipeline.utils import partition_uniform from colossalai.logging import get_dist_logger -from colossalai.pipeline.utils import partition_uniform from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D from .gpt1d import FusedGPTTransformerLayer1D, GPTTransformerLayer1D diff --git a/examples/language/gpt/titans/train_gpt.py b/examples/language/gpt/titans/train_gpt.py index b239b626c..3ed18b21f 100644 --- a/examples/language/gpt/titans/train_gpt.py +++ b/examples/language/gpt/titans/train_gpt.py @@ -8,14 +8,14 @@ from titans.model.gpt import GPTLMLoss import colossalai import colossalai.utils as utils -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.trainer import Trainer, hooks +from colossalai.legacy.zero.init_ctx import ZeroInitContext from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn import LinearWarmupLR from colossalai.utils import colo_set_process_memory_fraction, is_using_pp from colossalai.utils.timer import MultiTimer -from colossalai.zero.legacy.init_ctx import ZeroInitContext def calc_local_model_size(model: torch.nn.Module): diff --git a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py index a6a9ad0a3..33aa5990f 100644 --- a/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py +++ b/examples/tutorial/auto_parallel/auto_parallel_with_resnet.py @@ -4,8 +4,8 @@ from tqdm import tqdm import colossalai from colossalai.auto_parallel.tensor_shard.initialize import initialize_model -from colossalai.core import global_context as gpc from colossalai.device.device_mesh import DeviceMesh +from colossalai.legacy.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.nn.lr_scheduler import CosineAnnealingLR diff --git a/examples/tutorial/auto_parallel/test_ci.sh b/examples/tutorial/auto_parallel/test_ci.sh index bf6275b67..b27e36217 100644 --- a/examples/tutorial/auto_parallel/test_ci.sh +++ b/examples/tutorial/auto_parallel/test_ci.sh @@ -1,6 +1,8 @@ #!/bin/bash set -euxo pipefail -pip install -r requirements.txt -conda install -c conda-forge coin-or-cbc -colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py +echo "this test is outdated" + +# pip install -r requirements.txt +# conda install -c conda-forge coin-or-cbc +# colossalai run --nproc_per_node 4 auto_parallel_with_resnet.py diff --git 
a/examples/tutorial/hybrid_parallel/config.py b/examples/tutorial/hybrid_parallel/config.py index fe9abf2f1..287f62aa7 100644 --- a/examples/tutorial/hybrid_parallel/config.py +++ b/examples/tutorial/hybrid_parallel/config.py @@ -1,4 +1,4 @@ -from colossalai.amp import AMP_TYPE +from colossalai.legacy.amp import AMP_TYPE # hyperparameters # BATCH_SIZE is as per GPU diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py index 12cdec902..21a568168 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -5,12 +5,12 @@ from titans.model.vit.vit import _create_vit_model from tqdm import tqdm import colossalai -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn import CrossEntropyLoss +from colossalai.legacy.pipeline.pipelinable import PipelinableContext from colossalai.logging import get_dist_logger from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.utils import is_using_pp diff --git a/examples/tutorial/large_batch_optimizer/config.py b/examples/tutorial/large_batch_optimizer/config.py index 2efa0ffd0..c6d9f9450 100644 --- a/examples/tutorial/large_batch_optimizer/config.py +++ b/examples/tutorial/large_batch_optimizer/config.py @@ -1,4 +1,4 @@ -from colossalai.amp import AMP_TYPE +from colossalai.legacy.amp import AMP_TYPE # hyperparameters # BATCH_SIZE is as per GPU diff --git a/examples/tutorial/large_batch_optimizer/test_ci.sh b/examples/tutorial/large_batch_optimizer/test_ci.sh index 89f426c54..f43939382 100644 --- a/examples/tutorial/large_batch_optimizer/test_ci.sh +++ b/examples/tutorial/large_batch_optimizer/test_ci.sh @@ -1,8 +1,9 @@ #!/bin/bash set -euxo pipefail +echo "this test is outdated" -pip install -r requirements.txt +# pip install -r requirements.txt # run test -colossalai run --nproc_per_node 4 --master_port 29500 train.py --config config.py --optimizer lars -colossalai run --nproc_per_node 4 --master_port 29501 train.py --config config.py --optimizer lamb +# colossalai run --nproc_per_node 4 --master_port 29500 train.py --config config.py --optimizer lars +# colossalai run --nproc_per_node 4 --master_port 29501 train.py --config config.py --optimizer lamb diff --git a/examples/tutorial/large_batch_optimizer/train.py b/examples/tutorial/large_batch_optimizer/train.py index 35e54582f..6ebd8d680 100644 --- a/examples/tutorial/large_batch_optimizer/train.py +++ b/examples/tutorial/large_batch_optimizer/train.py @@ -4,7 +4,7 @@ from torchvision.models import resnet18 from tqdm import tqdm import colossalai -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.nn.optimizer import Lamb, Lars diff --git a/examples/tutorial/opt/opt/colossalai_zero.py b/examples/tutorial/opt/opt/colossalai_zero.py index 7c2c15245..8fbed6e83 100644 --- a/examples/tutorial/opt/opt/colossalai_zero.py +++ b/examples/tutorial/opt/opt/colossalai_zero.py @@ -2,7 +2,7 @@ try: from colossalai.zero.shard_utils import TensorShardStrategy except ImportError: # colossalai > 0.2.8 - from colossalai.zero.legacy import TensorShardStrategy + from colossalai.legacy.zero 
import TensorShardStrategy zero = dict(model_config=dict(shard_strategy=TensorShardStrategy(), tensor_placement_policy="auto", diff --git a/examples/tutorial/opt/opt/context.py b/examples/tutorial/opt/opt/context.py index 95f0abf1d..dfcd3b382 100644 --- a/examples/tutorial/opt/opt/context.py +++ b/examples/tutorial/opt/opt/context.py @@ -1,7 +1,7 @@ import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc class barrier_context(): diff --git a/examples/tutorial/opt/opt/run_clm.py b/examples/tutorial/opt/opt/run_clm.py index 91380e243..8cbf3d2a2 100755 --- a/examples/tutorial/opt/opt/run_clm.py +++ b/examples/tutorial/opt/opt/run_clm.py @@ -51,12 +51,13 @@ from transformers import ( from transformers.utils.versions import require_version import colossalai -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.tensor import ProcessGroup +from colossalai.legacy.utils import get_dataloader from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.optimizer import HybridAdam -from colossalai.tensor import ProcessGroup -from colossalai.utils import get_current_device, get_dataloader +from colossalai.utils import get_current_device from colossalai.zero import GeminiOptimizer require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/tutorial/opt/opt/test_ci.sh b/examples/tutorial/opt/opt/test_ci.sh index 431b37c12..9cbc49c7b 100755 --- a/examples/tutorial/opt/opt/test_ci.sh +++ b/examples/tutorial/opt/opt/test_ci.sh @@ -1,21 +1,21 @@ #!/bin/bash set -xue +echo "this test is outdated" +# pip install -r requirements.txt -pip install -r requirements.txt +# BS=4 +# MEMCAP=0 +# GPUNUM=4 +# MODLE="facebook/opt-125m" -BS=4 -MEMCAP=0 -GPUNUM=4 -MODLE="facebook/opt-125m" - -torchrun \ - --nproc_per_node ${GPUNUM} \ - --master_port 19198 \ - run_clm.py \ - -s \ - --output_dir $PWD \ - --mem_cap ${MEMCAP} \ - --model_name_or_path ${MODLE} \ - --per_device_train_batch_size ${BS} \ - --num_train_epochs 1 +# torchrun \ +# --nproc_per_node ${GPUNUM} \ +# --master_port 19198 \ +# run_clm.py \ +# -s \ +# --output_dir $PWD \ +# --mem_cap ${MEMCAP} \ +# --model_name_or_path ${MODLE} \ +# --per_device_train_batch_size ${BS} \ +# --num_train_epochs 1 diff --git a/examples/tutorial/sequence_parallel/config.py b/examples/tutorial/sequence_parallel/config.py index 6edf9cc2c..887de7164 100644 --- a/examples/tutorial/sequence_parallel/config.py +++ b/examples/tutorial/sequence_parallel/config.py @@ -1,4 +1,4 @@ -from colossalai.amp import AMP_TYPE +from colossalai.legacy.amp import AMP_TYPE # hyper-parameters TRAIN_ITERS = 10 diff --git a/examples/tutorial/sequence_parallel/data/__init__.py b/examples/tutorial/sequence_parallel/data/__init__.py index 1ef2d9993..6fdf07ba5 100644 --- a/examples/tutorial/sequence_parallel/data/__init__.py +++ b/examples/tutorial/sequence_parallel/data/__init__.py @@ -1,10 +1,12 @@ -from colossalai.context.parallel_context import ParallelContext -from colossalai.core import global_context as gpc +import torch + +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.context.parallel_context import ParallelContext 
+from colossalai.legacy.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.context import ParallelMode -from .datasets.data_samplers import build_pretraining_data_loader + from .datasets.builder import build_train_valid_test_datasets -import torch +from .datasets.data_samplers import build_pretraining_data_loader def cyclic_iter(iter): @@ -18,8 +20,7 @@ def build_train_valid_test_data_iterators(train_iters, eval_interval, eval_iters, dataloader_type='single', - **kwargs - ): + **kwargs): (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) logger = get_dist_logger() @@ -42,9 +43,7 @@ def build_train_valid_test_data_iterators(train_iters, train_samples = train_iters * global_batch_size eval_iters_ = (train_iters // eval_interval + 1) * eval_iters test_iters = eval_iters - train_val_test_num_samples = [train_samples, - eval_iters_ * global_batch_size, - test_iters * global_batch_size] + train_val_test_num_samples = [train_samples, eval_iters_ * global_batch_size, test_iters * global_batch_size] logger.info(' > datasets target sizes (minimum size):') logger.info(' train: {}'.format(train_val_test_num_samples[0]), ranks=[0]) logger.info(' validation: {}'.format(train_val_test_num_samples[1]), ranks=[0]) @@ -56,19 +55,20 @@ def build_train_valid_test_data_iterators(train_iters, # Build dataloaders. dp_size = gpc.get_world_size(ParallelMode.DATA) - train_dataloader = build_pretraining_data_loader( - train_ds, consumed_samples=0, micro_batch_size=global_batch_size//dp_size) - valid_dataloader = build_pretraining_data_loader( - valid_ds, consumed_samples=0, micro_batch_size=global_batch_size//dp_size) - test_dataloader = build_pretraining_data_loader(test_ds, 0, micro_batch_size=global_batch_size//dp_size) + train_dataloader = build_pretraining_data_loader(train_ds, + consumed_samples=0, + micro_batch_size=global_batch_size // dp_size) + valid_dataloader = build_pretraining_data_loader(valid_ds, + consumed_samples=0, + micro_batch_size=global_batch_size // dp_size) + test_dataloader = build_pretraining_data_loader(test_ds, 0, micro_batch_size=global_batch_size // dp_size) # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and train_iters > 0 do_valid = valid_dataloader is not None and eval_iters > 0 do_test = test_dataloader is not None and eval_iters > 0 # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( - [int(do_train), int(do_valid), int(do_test)]) + flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)]) else: flags = torch.cuda.LongTensor([0, 0, 0]) diff --git a/examples/tutorial/sequence_parallel/data/bert_helper.py b/examples/tutorial/sequence_parallel/data/bert_helper.py index d092db3e7..b65ca1e64 100644 --- a/examples/tutorial/sequence_parallel/data/bert_helper.py +++ b/examples/tutorial/sequence_parallel/data/bert_helper.py @@ -1,7 +1,8 @@ -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode import torch +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc + _MAX_DATA_DIM = 5 @@ -22,7 +23,8 @@ def _build_key_size_numel_dictionaries(keys, data): # Move to GPU and broadcast. 
sizes_cuda = torch.cuda.LongTensor(sizes) - torch.distributed.broadcast(sizes_cuda, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], + torch.distributed.broadcast(sizes_cuda, + gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], group=gpc.get_group(ParallelMode.TENSOR)) # Move back to cpu and unpack. @@ -60,19 +62,15 @@ def broadcast_data(keys, data, datatype): """ # Build (key, size) and (key, number of elements) dictionaries along # with the total number of elements on all ranks. - key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, - data) + key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) # Pack on rank zero. if not gpc.is_initialized(ParallelMode.TENSOR) or gpc.get_local_rank(ParallelMode.TENSOR) == 0: # Check that all keys have the same data type. # Flatten the data associated with the keys - flatten_data = torch.cat( - [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda() else: - flatten_data = torch.empty(total_numel, - device=torch.cuda.current_device(), - dtype=datatype) + flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype) # Broadcast torch.distributed.broadcast(flatten_data, @@ -139,7 +137,7 @@ def get_batch_for_sequence_parallel(data_iterator): seq_length = data_b['text'].size(1) sub_seq_length = seq_length // local_world_size sub_seq_start = local_rank * sub_seq_length - sub_seq_end = (local_rank+1) * sub_seq_length + sub_seq_end = (local_rank + 1) * sub_seq_length # # # Unpack. tokens = data_b['text'][:, sub_seq_start:sub_seq_end].long() @@ -156,10 +154,9 @@ class SequenceParallelDataIterator: def __init__(self, data_iter): self.data_iter = data_iter - def __iter__(self): return self.data_iter def __next__(self): - return get_batch_for_sequence_parallel(self.data_iter) \ No newline at end of file + return get_batch_for_sequence_parallel(self.data_iter) diff --git a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py index d6388bd9f..70c126912 100644 --- a/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py +++ b/examples/tutorial/sequence_parallel/data/datasets/bert_dataset.py @@ -21,8 +21,8 @@ import numpy as np import torch from torch.utils.data import Dataset -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.logging import get_dist_logger from ..tokenizer import get_tokenizer diff --git a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py index cf547ad97..b9c197c95 100644 --- a/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py +++ b/examples/tutorial/sequence_parallel/data/datasets/data_samplers.py @@ -14,10 +14,12 @@ # limitations under the License. 
"""Dataloaders.""" -import torch import random -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode + +import torch + +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc def build_pretraining_data_loader(dataset, consumed_samples, micro_batch_size, dataloader_type='single', num_workers=0): diff --git a/examples/tutorial/sequence_parallel/data/tokenizer/tokenizer.py b/examples/tutorial/sequence_parallel/data/tokenizer/tokenizer.py index ee3c923e8..ba832b5cd 100644 --- a/examples/tutorial/sequence_parallel/data/tokenizer/tokenizer.py +++ b/examples/tutorial/sequence_parallel/data/tokenizer/tokenizer.py @@ -12,13 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """Megatron tokenizers.""" -from abc import ABC -from abc import abstractmethod -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode +from abc import ABC, abstractmethod + +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from .bert_tokenization import FullTokenizer as FullBertTokenizer @@ -26,18 +25,13 @@ from .bert_tokenization import FullTokenizer as FullBertTokenizer def build_tokenizer(vocab_file, tokenizer_type, vocab_extra_ids=0): """Initialize tokenizer.""" if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0: - print('> building {} tokenizer ...'.format(tokenizer_type), - flush=True) + print('> building {} tokenizer ...'.format(tokenizer_type), flush=True) # Select and instantiate the tokenizer. if tokenizer_type == 'BertWordPieceLowerCase': - tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, - lower_case=True, - vocab_extra_ids=vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, lower_case=True, vocab_extra_ids=vocab_extra_ids) elif tokenizer_type == 'BertWordPieceCase': - tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, - lower_case=False, - vocab_extra_ids=vocab_extra_ids) + tokenizer = _BertWordPieceTokenizer(vocab_file=vocab_file, lower_case=False, vocab_extra_ids=vocab_extra_ids) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(tokenizer_type)) @@ -62,8 +56,8 @@ def _vocab_size_with_padding(orig_vocab_size, make_vocab_size_divisible_by=128): after += 1 if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0: print(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format( - orig_vocab_size, after - orig_vocab_size, after), flush=True) + '(new size: {})'.format(orig_vocab_size, after - orig_vocab_size, after), + flush=True) return after @@ -142,8 +136,7 @@ class _BertWordPieceTokenizer(AbstractTokenizer): self._additional_special_tokens = [] # (dsachan) Add BOS and EOS tokens - SPECIAL_TOKENS = {'eos_token': '[EOS]', - 'bos_token': '[BOS]'} + SPECIAL_TOKENS = {'eos_token': '[EOS]', 'bos_token': '[BOS]'} self._bos_token = '[BOS]' self.add_token(self._bos_token) self._bos_token_id = self.vocab.get(self._bos_token) @@ -155,8 +148,7 @@ class _BertWordPieceTokenizer(AbstractTokenizer): # (dsachan) Add additional special tokens # These can be used as sentinel tokens in T5 model inputs additional_special_tokens = [] - additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) + additional_special_tokens.extend(["".format(i) for i 
in range(vocab_extra_ids)]) self.add_additional_special_tokens(additional_special_tokens) def add_token(self, token): diff --git a/examples/tutorial/sequence_parallel/loss_func/bert_loss.py b/examples/tutorial/sequence_parallel/loss_func/bert_loss.py index e87a778cf..b3f2487a4 100644 --- a/examples/tutorial/sequence_parallel/loss_func/bert_loss.py +++ b/examples/tutorial/sequence_parallel/loss_func/bert_loss.py @@ -1,37 +1,29 @@ import torch +import torch.distributed as dist import torch.nn as nn -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode -from colossalai.logging import get_dist_logger import torch.nn.functional as F -import torch.distributed as dist + +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.logging import get_dist_logger + from .cross_entropy import vocab_cross_entropy class BertLoss(nn.Module): - def forward(self, - lm_loss, - sop_logits, - loss_mask, - sentence_order): + def forward(self, lm_loss, sop_logits, loss_mask, sentence_order): lm_loss_ = lm_loss.float() loss_mask = loss_mask.float() loss_mask_sum = loss_mask.sum() - lm_loss = torch.sum( - lm_loss_.view(-1) * loss_mask.reshape(-1)) + lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) lm_loss /= loss_mask_sum - torch.distributed.all_reduce( - lm_loss, - group=gpc.get_group(ParallelMode.SEQUENCE) - ) + torch.distributed.all_reduce(lm_loss, group=gpc.get_group(ParallelMode.SEQUENCE)) if sop_logits is not None: - sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), - sentence_order.view(-1), - ignore_index=-1) + sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(), sentence_order.view(-1), ignore_index=-1) sop_loss = sop_loss.float() loss = lm_loss + sop_loss * gpc.get_world_size(ParallelMode.SEQUENCE) else: diff --git a/examples/tutorial/sequence_parallel/loss_func/cross_entropy.py b/examples/tutorial/sequence_parallel/loss_func/cross_entropy.py index 54553c29a..ed15c6ea8 100644 --- a/examples/tutorial/sequence_parallel/loss_func/cross_entropy.py +++ b/examples/tutorial/sequence_parallel/loss_func/cross_entropy.py @@ -1,7 +1,8 @@ -from colossalai.context.parallel_mode import ParallelMode import torch from torch.cuda.amp import custom_bwd, custom_fwd +from colossalai.legacy.context.parallel_mode import ParallelMode + class _VocabCrossEntropy(torch.autograd.Function): @@ -24,8 +25,7 @@ class _VocabCrossEntropy(torch.autograd.Function): # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. logits_2d = vocab_parallel_logits.view(-1, vocab_parallel_logits.size(-1)) masked_target_1d = masked_target.view(-1) - arange_1d = torch.arange(start=0, end=logits_2d.size()[0], - device=logits_2d.device) + arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device) predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] predicted_logits_1d = predicted_logits_1d.clone().contiguous() predicted_logits = predicted_logits_1d.view_as(target) @@ -58,10 +58,8 @@ class _VocabCrossEntropy(torch.autograd.Function): grad_2d = grad_input.view(-1, partition_vocab_size) # Add the gradient from matching classes. 
- arange_1d = torch.arange(start=0, end=grad_2d.size()[0], - device=grad_2d.device) - grad_2d[arange_1d, masked_target_1d] -= ( - 1.0 - target_mask.view(-1).float()) + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= (1.0 - target_mask.view(-1).float()) # Finally elementwise multiplication with the output gradients. grad_input.mul_(grad_output.unsqueeze(dim=-1)) diff --git a/examples/tutorial/sequence_parallel/model/bert.py b/examples/tutorial/sequence_parallel/model/bert.py index b8adb501f..4ba64bbe2 100644 --- a/examples/tutorial/sequence_parallel/model/bert.py +++ b/examples/tutorial/sequence_parallel/model/bert.py @@ -3,13 +3,13 @@ import inspect import torch import torch.nn as nn -from colossalai.context import ParallelMode -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.kernel import LayerNorm +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.pipeline.utils import partition_uniform from colossalai.logging import get_dist_logger -from colossalai.pipeline.utils import partition_uniform from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding from .layers.init_method import init_normal, output_init_normal diff --git a/examples/tutorial/sequence_parallel/model/layers/head.py b/examples/tutorial/sequence_parallel/model/layers/head.py index ea336b9d1..9e25157e1 100644 --- a/examples/tutorial/sequence_parallel/model/layers/head.py +++ b/examples/tutorial/sequence_parallel/model/layers/head.py @@ -1,15 +1,17 @@ -import colossalai import torch import torch.nn as nn import torch.nn.functional as F -from .pooler import Pooler -from .linear import Linear -from .embedding import VocabEmbedding -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode -from colossalai.kernel import LayerNorm from loss_func.cross_entropy import vocab_cross_entropy +import colossalai +from colossalai.kernel import LayerNorm +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc + +from .embedding import VocabEmbedding +from .linear import Linear +from .pooler import Pooler + class BertLMHead(nn.Module): """Masked LM head for Bert @@ -19,10 +21,11 @@ class BertLMHead(nn.Module): layernorm_epsilon: tolerance for layer norm divisions """ - def __init__(self, - vocab_size, - hidden_size, - ): + def __init__( + self, + vocab_size, + hidden_size, + ): super(BertLMHead, self).__init__() self.bias = torch.nn.Parameter(torch.zeros(vocab_size)) diff --git a/examples/tutorial/sequence_parallel/model/layers/preprocess.py b/examples/tutorial/sequence_parallel/model/layers/preprocess.py index 53a326dda..dd66bfe13 100644 --- a/examples/tutorial/sequence_parallel/model/layers/preprocess.py +++ b/examples/tutorial/sequence_parallel/model/layers/preprocess.py @@ -1,7 +1,8 @@ -from colossalai.context.parallel_mode import ParallelMode import torch import torch.nn as nn -from colossalai.core import global_context as gpc + +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc class PreProcessor(nn.Module): @@ -14,8 +15,8 @@ class PreProcessor(nn.Module): # Create position ids 
seq_length = token_ids.size(1) local_rank = gpc.get_local_rank(ParallelMode.SEQUENCE) - position_ids = torch.arange(seq_length*local_rank, - seq_length * (local_rank+1), + position_ids = torch.arange(seq_length * local_rank, + seq_length * (local_rank + 1), dtype=torch.long, device=token_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(token_ids) diff --git a/examples/tutorial/sequence_parallel/test_ci.sh b/examples/tutorial/sequence_parallel/test_ci.sh index 7bc20de3b..1cd646526 100644 --- a/examples/tutorial/sequence_parallel/test_ci.sh +++ b/examples/tutorial/sequence_parallel/test_ci.sh @@ -1,7 +1,8 @@ #!/bin/bash set -euxo pipefail -pip install -r requirements.txt +echo "this test is outdated" +# pip install -r requirements.txt # run test -colossalai run --nproc_per_node 4 train.py +# colossalai run --nproc_per_node 4 train.py diff --git a/examples/tutorial/sequence_parallel/train.py b/examples/tutorial/sequence_parallel/train.py index 86c4edeb5..b8b89cda5 100644 --- a/examples/tutorial/sequence_parallel/train.py +++ b/examples/tutorial/sequence_parallel/train.py @@ -8,14 +8,15 @@ from lr_scheduler import AnnealingLR from model.bert import BertForPretrain, build_pipeline_bert import colossalai -from colossalai.amp import AMP_TYPE -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc from colossalai.kernel import LayerNorm +from colossalai.legacy.amp import AMP_TYPE +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.engine.schedule import PipelineSchedule +from colossalai.legacy.utils import is_using_pp from colossalai.logging import get_dist_logger from colossalai.nn.optimizer import FusedAdam -from colossalai.utils import MultiTimer, is_using_pp +from colossalai.utils import MultiTimer def process_batch_data(batch_data): diff --git a/tests/components_to_test/resnet.py b/tests/components_to_test/resnet.py index 193832ebc..df01e4c48 100644 --- a/tests/components_to_test/resnet.py +++ b/tests/components_to_test/resnet.py @@ -1,11 +1,14 @@ -from torchvision.models import resnet18 -from .registry import non_distributed_component_funcs -from pathlib import Path import os +from pathlib import Path + import torch -from torchvision.transforms import transforms from torchvision.datasets import CIFAR10 -from colossalai.utils import get_dataloader +from torchvision.models import resnet18 +from torchvision.transforms import transforms + +from colossalai.legacy.utils import get_dataloader + +from .registry import non_distributed_component_funcs def get_cifar10_dataloader(train): diff --git a/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py b/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py index f184f64b3..b65e6d0d8 100644 --- a/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py +++ b/tests/test_auto_parallel/test_ckpt_solvers/test_C_solver_consistency.py @@ -6,12 +6,12 @@ import torch.fx import torchvision.models as tm import colossalai -from colossalai.core import global_context as gpc from colossalai.fx import ColoGraphModule, ColoTracer from colossalai.fx._compatibility import is_compatible_with_meta # from colossalai.fx.passes.algorithms import solver_rotor # from colossalai.fx.passes.algorithms.operation import Sequence from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.core import global_context as gpc from colossalai.testing 
import rerun_if_address_is_in_use, spawn if is_compatible_with_meta(): diff --git a/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py b/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py index db268b91d..babdddfad 100644 --- a/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py +++ b/tests/test_auto_parallel/test_ckpt_solvers/test_ckpt_torchvision.py @@ -8,12 +8,12 @@ import torchvision.models as tm from torch.fx import GraphModule import colossalai -from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer from colossalai.fx._compatibility import is_compatible_with_meta from colossalai.fx.graph_module import ColoGraphModule # from colossalai.fx.passes.algorithms import chen_greedy, solver_rotor from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.core import global_context as gpc from colossalai.testing import rerun_if_address_is_in_use, spawn if is_compatible_with_meta(): diff --git a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py index 4e3c26c1b..715f62358 100644 --- a/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py +++ b/tests/test_auto_parallel/test_tensor_shard/test_compatibility_with_gemini.py @@ -13,10 +13,9 @@ from colossalai.device.device_mesh import DeviceMesh from colossalai.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.nn.optimizer import HybridAdam -from colossalai.tensor.process_group import ProcessGroup from colossalai.testing import assert_close, rerun_if_address_is_in_use, run_on_environment_flag, spawn from colossalai.utils import get_current_device -from colossalai.zero import post_process_colo_init_ctx, zero_model_wrapper, zero_optim_wrapper +from colossalai.zero import zero_model_wrapper, zero_optim_wrapper class MLP(torch.nn.Module): @@ -70,14 +69,12 @@ def check_auto_parallel_with_gemini(rank, world_size, port): print(strategy) print('=' * msg_length) - dp_process_group = ProcessGroup(rank=rank, ranks=[0, 1, 2, 3], tp_degree=2, dp_degree=2) gemini_config = dict(strict_ddp_mode=False, device=get_current_device(), placement_policy='cpu', pin_memory=True, search_range_m=128) - post_process_colo_init_ctx(gm, device=get_current_device(), default_pg=dp_process_group) gm = zero_model_wrapper(gm, zero_stage=3, gemini_config=gemini_config) optimizer = HybridAdam(gm.parameters(), betas=(0, 0)) optimizer = zero_optim_wrapper(gm, optimizer, initial_scale=1) diff --git a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py index 15610e2b5..593658fd1 100644 --- a/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py +++ b/tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py @@ -6,9 +6,9 @@ import torch.fx import colossalai from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE from colossalai.autochunk.utils import flat_list -from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.core import global_context as gpc from colossalai.testing import free_port if AUTOCHUNK_AVAILABLE: diff --git a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py 
b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py index b6a792f56..264331a5f 100644 --- a/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py +++ b/tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py @@ -5,9 +5,9 @@ import torch.fx import colossalai from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE -from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.core import global_context as gpc if AUTOCHUNK_AVAILABLE: from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen diff --git a/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py index 3202318fb..65d1e9c4d 100644 --- a/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py +++ b/tests/test_autochunk/test_autochunk_vit/test_autochunk_vit_utils.py @@ -5,9 +5,9 @@ import torch.fx import colossalai from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE -from colossalai.core import global_context as gpc from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.passes.meta_info_prop import MetaInfoProp +from colossalai.legacy.core import global_context as gpc if AUTOCHUNK_AVAILABLE: from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen diff --git a/tests/test_cluster/test_process_group_mesh.py b/tests/test_cluster/test_process_group_mesh.py index 13b711942..2304203d1 100644 --- a/tests/test_cluster/test_process_group_mesh.py +++ b/tests/test_cluster/test_process_group_mesh.py @@ -7,8 +7,8 @@ from colossalai.testing import spawn def check_process_group_mesh_with_gpc(): - from colossalai.context import ParallelMode - from colossalai.core import global_context as gpc + from colossalai.legacy.context import ParallelMode + from colossalai.legacy.core import global_context as gpc DP_DIM, PP_DIM, TP_DIM = 0, 1, 2 pg_mesh = ProcessGroupMesh(1, 2, 2) @@ -138,7 +138,7 @@ def run_dist(rank, world_size, port): port=port, host='localhost') # TODO(ver217): this function should be removed when gpc is removed - check_process_group_mesh_with_gpc() + # check_process_group_mesh_with_gpc() check_process_group_mesh_with_cases() diff --git a/tests/test_context/configs/parallel_2d_init.py b/tests/test_context/configs/parallel_2d_init.py deleted file mode 100644 index 6af884450..000000000 --- a/tests/test_context/configs/parallel_2d_init.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -parallel = dict( - pipeline=dict(size=2), - tensor=dict( - size=4, - mode='2d' - ) -) diff --git a/tests/test_context/configs/parallel_2p5d_init.py b/tests/test_context/configs/parallel_2p5d_init.py deleted file mode 100644 index c2d896d38..000000000 --- a/tests/test_context/configs/parallel_2p5d_init.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -parallel = dict( - pipeline=dict(size=2), - tensor=dict( - size=8, - depth=2, - mode='2.5d' - ) -) diff --git a/tests/test_context/configs/parallel_3d_init.py b/tests/test_context/configs/parallel_3d_init.py deleted file mode 100644 index 0ec724f8b..000000000 --- a/tests/test_context/configs/parallel_3d_init.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -parallel = dict( - pipeline=dict(size=2), - tensor=dict( - size=8, - mode='3d' - ) -) 
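The parallel-context configs deleted just above reappear later in this patch under `tests/test_legacy/test_context/configs` in condensed one-line form. A minimal sketch of how such a legacy config is consumed after the move, using only the `colossalai.legacy` entry points shown in the surrounding hunks; the worker function, port handling, and the `ParallelMode.PIPELINE` assertion are illustrative assumptions, not part of this patch:

```python
# Illustrative only: exercising a relocated legacy parallel config.
# Mirrors tests/test_legacy/test_context/configs/parallel_2d_init.py
# (pipeline size 2, 2D tensor parallelism of size 4, i.e. 8 ranks total).
from colossalai.legacy import launch
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc

CONFIG = dict(parallel=dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d')))


def run_worker(rank: int, world_size: int, port: int):
    # colossalai.legacy.launch replaces colossalai.launch for gpc-based tests.
    launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    assert gpc.get_world_size(ParallelMode.PIPELINE) == 2    # assumes ParallelMode.PIPELINE is unchanged
    assert gpc.get_world_size(ParallelMode.TENSOR) == 4
    gpc.destroy()
```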
diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py index 7c6339eff..c18bf5675 100644 --- a/tests/test_device/test_init_logical_pg.py +++ b/tests/test_device/test_init_logical_pg.py @@ -3,7 +3,6 @@ import torch import torch.distributed as dist from torch.distributed import ReduceOp -from colossalai.core import global_context as gpc from colossalai.device.device_mesh import DeviceMesh from colossalai.initialize import launch from colossalai.testing import rerun_if_address_is_in_use, spawn @@ -13,7 +12,7 @@ def check_layer(rank, world_size, port): launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') physical_mesh_id = torch.arange(0, 4) - assert rank == gpc.get_global_rank() + assert rank == dist.get_rank() tensor_to_check = torch.tensor([2, 2, 2, 2]).cuda() mesh_shape = (2, 2) @@ -27,8 +26,6 @@ def check_layer(rank, world_size, port): dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg) assert tensor.equal(tensor_to_check) - gpc.destroy() - @pytest.mark.dist @rerun_if_address_is_in_use() diff --git a/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py b/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py index bcac2ec42..6a12f5bc8 100644 --- a/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py +++ b/tests/test_fx/test_codegen/test_activation_checkpoint_codegen.py @@ -4,9 +4,9 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint import colossalai -from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule +from colossalai.legacy.core import global_context as gpc from colossalai.testing import rerun_if_address_is_in_use, spawn try: diff --git a/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py b/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py index 5b327807a..ebcfb4d7b 100644 --- a/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py +++ b/tests/test_fx/test_codegen/test_nested_activation_checkpoint_codegen.py @@ -2,9 +2,9 @@ import pytest import torch import colossalai -from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule +from colossalai.legacy.core import global_context as gpc from colossalai.testing import rerun_if_address_is_in_use, spawn try: diff --git a/tests/test_fx/test_codegen/test_offload_codegen.py b/tests/test_fx/test_codegen/test_offload_codegen.py index c217b9658..dac59c236 100644 --- a/tests/test_fx/test_codegen/test_offload_codegen.py +++ b/tests/test_fx/test_codegen/test_offload_codegen.py @@ -5,9 +5,9 @@ import torch from torch.fx import GraphModule import colossalai -from colossalai.core import global_context as gpc from colossalai.fx import ColoTracer from colossalai.fx.graph_module import ColoGraphModule +from colossalai.legacy.core import global_context as gpc from colossalai.testing import rerun_if_address_is_in_use, spawn try: diff --git a/tests/test_fx/test_parallel_1d.py b/tests/test_fx/test_parallel_1d.py index 1044be7db..29135b45f 100644 --- a/tests/test_fx/test_parallel_1d.py +++ b/tests/test_fx/test_parallel_1d.py @@ -5,9 +5,9 @@ import pytest import torch from torch.fx import symbolic_trace -from colossalai.core import global_context as gpc from colossalai.fx.passes import column_shard_linear_pass from colossalai.initialize import launch +from colossalai.legacy.core import 
global_context as gpc from colossalai.logging import disable_existing_loggers from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn diff --git a/tests/test_fx/test_pipeline/test_topo/topo_utils.py b/tests/test_fx/test_pipeline/test_topo/topo_utils.py index 55dd65201..db6cadfc5 100644 --- a/tests/test_fx/test_pipeline/test_topo/topo_utils.py +++ b/tests/test_fx/test_pipeline/test_topo/topo_utils.py @@ -1,18 +1,22 @@ +import random + +import numpy as np import torch from torch.fx import GraphModule -from colossalai.fx.passes.adding_split_node_pass import split_with_split_nodes_pass, balanced_split_pass + from colossalai.fx import ColoTracer -from colossalai.pipeline.middleware import Partition, PartitionInputVal, PartitionOutputVal, Topo -from colossalai.pipeline.middleware.adaptor import get_fx_topology -import random -import numpy as np +from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass +from colossalai.legacy.pipeline.middleware import Partition, PartitionInputVal, PartitionOutputVal, Topo +from colossalai.legacy.pipeline.middleware.adaptor import get_fx_topology MANUAL_SEED = 0 random.seed(MANUAL_SEED) np.random.seed(MANUAL_SEED) torch.manual_seed(MANUAL_SEED) + class MLP(torch.nn.Module): + def __init__(self, config={}): super().__init__() dim = config['dim'] @@ -27,6 +31,7 @@ class MLP(torch.nn.Module): x = layer(x) return x + def split_model_and_get_DAG(model, data_gen): model.eval() @@ -46,7 +51,7 @@ def split_model_and_get_DAG(model, data_gen): # apply transform passes annotated_model = balanced_split_pass(gm, 2) top_module, split_submodules = split_with_split_nodes_pass(annotated_model) - + topo = get_fx_topology(top_module) for submodule in split_submodules: if isinstance(submodule, torch.fx.GraphModule): @@ -54,6 +59,7 @@ def split_model_and_get_DAG(model, data_gen): return top_module, split_submodules[0]._topo + def check_input(top_module, input_partition: Partition): partition_output = input_partition.get_output_vals() arg_pos = 0 @@ -63,13 +69,14 @@ def check_input(top_module, input_partition: Partition): to_partition_and_offset = cur_checkee.get() assert len(to_partition_and_offset) == len(node.users.keys()) arg_pos += 1 - + assert arg_pos == len(partition_output) - + + def check_submod(top_module, part_id, mid_partition: Partition): partition_input = mid_partition.get_input_vals() partition_output = mid_partition.get_output_vals() - + cnt = 1 cur_node = None for node in top_module.graph.nodes: @@ -78,15 +85,15 @@ def check_submod(top_module, part_id, mid_partition: Partition): if cnt == part_id: cur_node = node break - + assert len(partition_input) == len(cur_node.args) assert len(partition_output) == len(cur_node.users) -def check_topo(top_module, topo: Topo): + +def check_topo(top_module, topo: Topo): input_partition = topo.get_input_partition() mid_partitions = topo.get_mid_partitions() - + check_input(top_module, input_partition) for part_id, submod in mid_partitions.items(): check_submod(top_module, part_id, submod) - \ No newline at end of file diff --git a/tests/test_amp/test_naive_fp16.py b/tests/test_legacy/test_amp/test_naive_fp16.py similarity index 94% rename from tests/test_amp/test_naive_fp16.py rename to tests/test_legacy/test_amp/test_naive_fp16.py index 6ce4c7f49..54bf64985 100644 --- a/tests/test_amp/test_naive_fp16.py +++ b/tests/test_legacy/test_amp/test_naive_fp16.py @@ -4,7 +4,7 @@ import pytest import torch import colossalai -from colossalai.amp import 
convert_to_apex_amp, convert_to_naive_amp +from colossalai.legacy.amp import convert_to_apex_amp, convert_to_naive_amp from colossalai.testing import assert_close_loose, clear_cache_before_run, rerun_if_address_is_in_use, spawn from tests.components_to_test.registry import non_distributed_component_funcs @@ -78,7 +78,7 @@ def run_naive_amp(): def run_dist(rank, world_size, port): - colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') + colossalai.legacy.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') run_naive_amp() diff --git a/tests/test_amp/test_torch_fp16.py b/tests/test_legacy/test_amp/test_torch_fp16.py similarity index 95% rename from tests/test_amp/test_torch_fp16.py rename to tests/test_legacy/test_amp/test_torch_fp16.py index 6451aa626..89810b5d0 100644 --- a/tests/test_amp/test_torch_fp16.py +++ b/tests/test_legacy/test_amp/test_torch_fp16.py @@ -4,7 +4,7 @@ import pytest import torch import colossalai -from colossalai.amp import convert_to_apex_amp, convert_to_torch_amp +from colossalai.legacy.amp import convert_to_apex_amp, convert_to_torch_amp from colossalai.testing import assert_close_loose, clear_cache_before_run, rerun_if_address_is_in_use, spawn from tests.components_to_test.registry import non_distributed_component_funcs @@ -78,7 +78,7 @@ def run_torch_amp(): def run_dist(rank, world_size, port): - colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') + colossalai.legacy.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost') run_torch_amp() diff --git a/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py index c5fb049fe..4851b3e36 100644 --- a/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py +++ b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py @@ -1,10 +1,10 @@ import pytest import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch from colossalai.legacy.communication.p2p_v2 import _recv_object, _send_object +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, spawn diff --git a/tests/test_legacy/test_comm/test_comm.py b/tests/test_legacy/test_comm/test_comm.py index 3251d8d46..fccfcd973 100644 --- a/tests/test_legacy/test_comm/test_comm.py +++ b/tests/test_legacy/test_comm/test_comm.py @@ -2,10 +2,10 @@ import pytest import torch import torch.distributed as dist -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.testing import rerun_if_address_is_in_use, spawn from colossalai.utils import get_current_device diff --git a/tests/test_legacy/test_comm/test_object_list_p2p.py b/tests/test_legacy/test_comm/test_object_list_p2p.py index f50982ee1..a1322e6f2 100644 --- a/tests/test_legacy/test_comm/test_object_list_p2p.py +++ b/tests/test_legacy/test_comm/test_object_list_p2p.py @@ -1,9 +1,6 
@@ import pytest import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch from colossalai.legacy.communication.p2p import ( recv_backward, recv_forward, @@ -12,6 +9,9 @@ from colossalai.legacy.communication.p2p import ( send_forward, send_forward_recv_backward, ) +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.testing import rerun_if_address_is_in_use, spawn CONFIG = dict(parallel=dict(pipeline=2)) diff --git a/tests/test_legacy/test_comm/test_object_list_p2p_v2.py b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py index 040c63322..f805bd19d 100644 --- a/tests/test_legacy/test_comm/test_object_list_p2p_v2.py +++ b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py @@ -1,10 +1,10 @@ import pytest import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch from colossalai.legacy.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, spawn diff --git a/tests/test_legacy/test_context/configs/parallel_2d_init.py b/tests/test_legacy/test_context/configs/parallel_2d_init.py new file mode 100644 index 000000000..6cf816942 --- /dev/null +++ b/tests/test_legacy/test_context/configs/parallel_2d_init.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +parallel = dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d')) diff --git a/tests/test_legacy/test_context/configs/parallel_2p5d_init.py b/tests/test_legacy/test_context/configs/parallel_2p5d_init.py new file mode 100644 index 000000000..b946d45b3 --- /dev/null +++ b/tests/test_legacy/test_context/configs/parallel_2p5d_init.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +parallel = dict(pipeline=dict(size=2), tensor=dict(size=8, depth=2, mode='2.5d')) diff --git a/tests/test_legacy/test_context/configs/parallel_3d_init.py b/tests/test_legacy/test_context/configs/parallel_3d_init.py new file mode 100644 index 000000000..a1564bbb2 --- /dev/null +++ b/tests/test_legacy/test_context/configs/parallel_3d_init.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +parallel = dict(pipeline=dict(size=2), tensor=dict(size=8, mode='3d')) diff --git a/tests/test_context/test_hybrid_parallel.py b/tests/test_legacy/test_context/test_hybrid_parallel.py similarity index 95% rename from tests/test_context/test_hybrid_parallel.py rename to tests/test_legacy/test_context/test_hybrid_parallel.py index d25668afd..05cd1d294 100644 --- a/tests/test_context/test_hybrid_parallel.py +++ b/tests/test_legacy/test_context/test_hybrid_parallel.py @@ -6,11 +6,11 @@ from pathlib import Path import pytest import torch -from colossalai import launch -from colossalai.context import reset_seeds -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as tp_env +from colossalai.legacy import launch +from colossalai.legacy.context import reset_seeds +from colossalai.legacy.context.parallel_mode 
import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as tp_env from colossalai.testing import free_port, rerun_if_address_is_in_use, spawn CONFIG_PATH_LIST = list(Path(__file__).parent.glob('configs/*.py')) diff --git a/tests/test_data/test_cifar10_dataset.py b/tests/test_legacy/test_data/test_cifar10_dataset.py similarity index 100% rename from tests/test_data/test_cifar10_dataset.py rename to tests/test_legacy/test_data/test_cifar10_dataset.py diff --git a/tests/test_data/test_data_parallel_sampler.py b/tests/test_legacy/test_data/test_data_parallel_sampler.py similarity index 87% rename from tests/test_data/test_data_parallel_sampler.py rename to tests/test_legacy/test_data/test_data_parallel_sampler.py index 7beef707c..cf10fe9df 100644 --- a/tests/test_data/test_data_parallel_sampler.py +++ b/tests/test_legacy/test_data/test_data_parallel_sampler.py @@ -10,10 +10,11 @@ import torch.distributed as dist from torchvision import datasets, transforms import colossalai -from colossalai.context import Config, ParallelMode -from colossalai.core import global_context as gpc +from colossalai.context import Config +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import get_dataloader from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils import get_dataloader CONFIG = Config(dict( parallel=dict( @@ -26,7 +27,7 @@ CONFIG = Config(dict( def run_data_sampler(rank, world_size, port): dist_args = dict(config=CONFIG, rank=rank, world_size=world_size, backend='gloo', port=port, host='localhost') - colossalai.launch(**dist_args) + colossalai.legacy.launch(**dist_args) print('finished initialization') # build dataset diff --git a/tests/test_legacy/test_data/test_deterministic_dataloader.py b/tests/test_legacy/test_data/test_deterministic_dataloader.py new file mode 100644 index 000000000..421b8d255 --- /dev/null +++ b/tests/test_legacy/test_data/test_deterministic_dataloader.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import os +from pathlib import Path + +import pytest +import torch +import torch.distributed as dist +from torchvision import datasets, transforms + +import colossalai +from colossalai.context import Config +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import get_dataloader +from colossalai.testing import rerun_if_address_is_in_use, spawn + +CONFIG = Config( + dict( + train_data=dict( + dataset=dict( + type='CIFAR10', + root=Path(os.environ['DATA']), + train=True, + download=True, + ), + dataloader=dict(num_workers=2, batch_size=2, shuffle=True), + ), + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None), + ), + seed=1024, + )) + + +def run_data_sampler(rank, world_size, port): + dist_args = dict(config=CONFIG, rank=rank, world_size=world_size, backend='gloo', port=port, host='localhost') + colossalai.legacy.launch(**dist_args) + + # build dataset + transform_pipeline = [transforms.ToTensor(), transforms.RandomCrop(size=32, padding=4)] + transform_pipeline = transforms.Compose(transform_pipeline) + dataset = datasets.CIFAR10(root=Path(os.environ['DATA']), train=True, download=True, transform=transform_pipeline) + + # build dataloader + dataloader = get_dataloader(dataset, batch_size=8, add_sampler=False) + + data_iter = iter(dataloader) 
+ img, label = next(data_iter) + img = img[0] + + if gpc.get_local_rank(ParallelMode.DATA) != 0: + img_to_compare = img.clone() + else: + img_to_compare = img + dist.broadcast(img_to_compare, src=0, group=gpc.get_group(ParallelMode.DATA)) + + if gpc.get_local_rank(ParallelMode.DATA) != 0: + # this is without sampler + # this should be false if a data parallel sampler is given to the dataloader + assert torch.equal(img, + img_to_compare), 'Expected the same image across ranks since no data parallel sampler is used' + torch.cuda.empty_cache() + + +@rerun_if_address_is_in_use() +def test_data_sampler(): + spawn(run_data_sampler, 4) + + +if __name__ == '__main__': + test_data_sampler() diff --git a/tests/test_legacy/test_engine/test_engine.py b/tests/test_legacy/test_engine/test_engine.py index 62493cf37..849978403 100644 --- a/tests/test_legacy/test_engine/test_engine.py +++ b/tests/test_legacy/test_engine/test_engine.py @@ -1,8 +1,8 @@ import pytest import colossalai -from colossalai.amp import AMP_TYPE -from colossalai.core import global_context as gpc +from colossalai.legacy.amp import AMP_TYPE +from colossalai.legacy.core import global_context as gpc from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn from tests.components_to_test.registry import non_distributed_component_funcs @@ -20,10 +20,11 @@ def run_train(model_name, amp_mode): model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func() model = model_builder(checkpoint=False) - engine, train_dataloader, *args = colossalai.initialize(model=model, - optimizer=optimizer_class(model.parameters(), lr=1e-3), - criterion=criterion, - train_dataloader=train_dataloader) + engine, train_dataloader, *args = colossalai.legacy.initialize(model=model, + optimizer=optimizer_class(model.parameters(), + lr=1e-3), + criterion=criterion, + train_dataloader=train_dataloader) try: engine.train() @@ -48,7 +49,12 @@ def run_train(model_name, amp_mode): def run_engine(rank, world_size, port): # init dist env - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') run_train() diff --git a/tests/test_legacy/test_engine/test_gradient_accumluation.py b/tests/test_legacy/test_engine/test_gradient_accumluation.py index 7783827c7..168c93c1a 100644 --- a/tests/test_legacy/test_engine/test_gradient_accumluation.py +++ b/tests/test_legacy/test_engine/test_gradient_accumluation.py @@ -10,10 +10,10 @@ from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 import colossalai -from colossalai.core import global_context as gpc +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.utils import get_dataloader from colossalai.logging import get_dist_logger from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils import get_dataloader # Config BATCH_SIZE = 2 @@ -27,7 +27,12 @@ CONFIG = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=1, mode=None def run_no_pipeline(rank, world_size, port): # init dist env - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') # build model model = resnet18(num_classes=10) @@ -49,10 +54,10 @@ def run_no_pipeline(rank, world_size,
port): optimizer = Adam(model.parameters(), lr=0.001) criterion = nn.CrossEntropyLoss() - engine, train_dataloader, *args = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) + engine, train_dataloader, *args = colossalai.legacy.initialize(model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader) logger = get_dist_logger() rank = torch.distributed.get_rank() param_track = [] diff --git a/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py index dcb2be626..859707e61 100644 --- a/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py +++ b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py @@ -2,9 +2,9 @@ import torch import torch.distributed as dist from torch.nn import Parameter -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.nn import ( Classifier1D, Embedding1D, @@ -15,7 +15,8 @@ from colossalai.legacy.nn import ( VocabParallelCrossEntropyLoss1D, VocabParallelEmbedding1D, ) -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.legacy.utils import print_rank_0 +from colossalai.utils import get_current_device from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal diff --git a/tests/test_legacy/test_layers/test_1d/test_1d.py b/tests/test_legacy/test_layers/test_1d/test_1d.py index 891512542..2a016ed7b 100644 --- a/tests/test_legacy/test_layers/test_1d/test_1d.py +++ b/tests/test_legacy/test_layers/test_1d/test_1d.py @@ -5,8 +5,8 @@ import pytest import torch from checks_1d.check_layer_1d import * -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, spawn diff --git a/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py index 0ee88c260..494497be3 100644 --- a/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py @@ -1,7 +1,7 @@ import torch -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn import ( Classifier2D, CrossEntropyLoss2D, @@ -15,7 +15,8 @@ from colossalai.legacy.nn import ( VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D, ) -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.legacy.utils import print_rank_0 +from colossalai.utils import get_current_device from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal diff --git a/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py 
b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py index ae1d1120c..034dbe5ca 100644 --- a/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py +++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py @@ -3,10 +3,11 @@ import torch -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.legacy.utils import print_rank_0 +from colossalai.utils import get_current_device from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal diff --git a/tests/test_legacy/test_layers/test_2d/test_2d.py b/tests/test_legacy/test_layers/test_2d/test_2d.py index bcea5ce7b..a4b46793f 100644 --- a/tests/test_legacy/test_layers/test_2d/test_2d.py +++ b/tests/test_legacy/test_layers/test_2d/test_2d.py @@ -18,8 +18,8 @@ from checks_2d.check_layer_2d import ( ) from checks_2d.check_operation_2d import check_AB, check_ABT, check_ATB -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, spawn diff --git a/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py index 5a99b05cf..e7a9a8be4 100644 --- a/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py @@ -1,8 +1,8 @@ import torch from torch.nn import Parameter -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn import ( Classifier2p5D, CrossEntropyLoss2p5D, @@ -16,7 +16,8 @@ from colossalai.legacy.nn import ( VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D, ) -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.legacy.utils import print_rank_0 +from colossalai.utils import get_current_device from .common import * diff --git a/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py index db1996767..fe78ef669 100644 --- a/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py @@ -1,9 +1,10 @@ import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.legacy.utils import print_rank_0 +from colossalai.utils import get_current_device from .common import * diff --git 
a/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py index 373d834d0..38ba3ba78 100644 --- a/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py +++ b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py @@ -3,8 +3,8 @@ import torch from checks_2p5d.check_layer_2p5d import * from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, spawn diff --git a/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py index cee639a9f..2a9dcc3cd 100644 --- a/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py +++ b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py @@ -5,8 +5,8 @@ import time import torch -from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.core import global_context +from colossalai.legacy.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.core import global_context from colossalai.legacy.nn import ( Classifier3D, CrossEntropyLoss3D, @@ -21,8 +21,9 @@ from colossalai.legacy.nn import ( VocabParallelEmbedding3D, ) from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env +from colossalai.legacy.utils import print_rank_0 from colossalai.logging import get_dist_logger -from colossalai.utils import get_current_device, print_rank_0 +from colossalai.utils import get_current_device from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal diff --git a/tests/test_legacy/test_layers/test_3d/test_3d.py b/tests/test_legacy/test_layers/test_3d/test_3d.py index fde71a4a0..2a32d8935 100644 --- a/tests/test_legacy/test_layers/test_3d/test_3d.py +++ b/tests/test_legacy/test_layers/test_3d/test_3d.py @@ -15,8 +15,8 @@ from checks_3d.check_layer_3d import ( check_vocab_parallel_loss, ) -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn diff --git a/tests/test_legacy/test_layers/test_cache_embedding.py b/tests/test_legacy/test_layers/test_cache_embedding.py index 0760a3f1e..c58445a39 100644 --- a/tests/test_legacy/test_layers/test_cache_embedding.py +++ b/tests/test_legacy/test_layers/test_cache_embedding.py @@ -14,7 +14,8 @@ from colossalai.legacy.nn.parallel.layers import ( ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ) -from colossalai.tensor import ColoTensor, ComputePattern, ComputeSpec, ProcessGroup, ShardSpec +from colossalai.legacy.tensor import ComputePattern, ComputeSpec, ProcessGroup, ShardSpec +from colossalai.tensor import ColoTensor from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn NUM_EMBED, EMBED_DIM = 10, 8 @@ -359,7 +360,7 @@ def run_parallel_freq_aware_embed_columnwise(rank, world_size): def run_dist(rank, world_size, port): - colossalai.launch(config={}, rank=rank, 
world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # run_parallel_freq_aware_embed_columnwise(rank, world_size) run_parallel_freq_aware_embed_tablewise(rank, world_size) diff --git a/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py index 7ff91a7b7..ac9493ada 100644 --- a/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py +++ b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py @@ -1,7 +1,7 @@ import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn import TransformerSelfAttentionRing from colossalai.utils import get_current_device diff --git a/tests/test_legacy/test_layers/test_sequence/test_sequence.py b/tests/test_legacy/test_layers/test_sequence/test_sequence.py index b9e6c1247..85226f9d9 100644 --- a/tests/test_legacy/test_layers/test_sequence/test_sequence.py +++ b/tests/test_legacy/test_layers/test_sequence/test_sequence.py @@ -3,8 +3,8 @@ import torch import torch.distributed as dist import colossalai -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK from colossalai.testing import rerun_if_address_is_in_use, spawn @@ -120,7 +120,7 @@ def check_ring_av(rank, world_size): def run_test(rank, world_size, port): - colossalai.launch(rank=rank, world_size=world_size, config=CONFIG, host='localhost', port=port) + colossalai.legacy.launch(rank=rank, world_size=world_size, config=CONFIG, host='localhost', port=port) # check_ring_qk(rank, world_size) check_ring_av(rank, world_size) diff --git a/tests/test_pipeline/rpc_test_utils.py b/tests/test_legacy/test_pipeline/rpc_test_utils.py similarity index 97% rename from tests/test_pipeline/rpc_test_utils.py rename to tests/test_legacy/test_pipeline/rpc_test_utils.py index dab474a4e..9a336c422 100644 --- a/tests/test_pipeline/rpc_test_utils.py +++ b/tests/test_legacy/test_pipeline/rpc_test_utils.py @@ -10,9 +10,9 @@ from torch import nn from torch._C._distributed_rpc import _is_current_rpc_agent_set from torch.optim import SGD, Adam, Optimizer, RMSprop -from colossalai import launch +from colossalai.legacy import launch +from colossalai.legacy.pipeline.pipeline_process_group import ppg from colossalai.logging import disable_existing_loggers -from colossalai.pipeline.pipeline_process_group import ppg rpc_is_initialized = _is_current_rpc_agent_set diff --git a/tests/test_pipeline/test_cuda_rpc_chimera.py b/tests/test_legacy/test_pipeline/test_cuda_rpc_chimera.py similarity index 94% rename from tests/test_pipeline/test_cuda_rpc_chimera.py rename to tests/test_legacy/test_pipeline/test_cuda_rpc_chimera.py index 45ad8f828..3bff08318 100644 --- a/tests/test_pipeline/test_cuda_rpc_chimera.py +++ b/tests/test_legacy/test_pipeline/test_cuda_rpc_chimera.py @@ -1,10 +1,10 @@ import torch -from torch import nn import torch.autograd as autograd +from rpc_test_utils import RpcTestModel, parse_args, rpc_run +from torch import nn -from colossalai.pipeline.rpc import 
ChimeraPipelineEngine +from colossalai.legacy.pipeline.rpc import ChimeraPipelineEngine from colossalai.testing import assert_close -from rpc_test_utils import rpc_run, parse_args, RpcTestModel # global variable for model created feat_num = 100 diff --git a/tests/test_pipeline/test_cuda_rpc_optimizer.py b/tests/test_legacy/test_pipeline/test_cuda_rpc_optimizer.py similarity index 89% rename from tests/test_pipeline/test_cuda_rpc_optimizer.py rename to tests/test_legacy/test_pipeline/test_cuda_rpc_optimizer.py index 842566730..eff031ff8 100644 --- a/tests/test_pipeline/test_cuda_rpc_optimizer.py +++ b/tests/test_legacy/test_pipeline/test_cuda_rpc_optimizer.py @@ -1,11 +1,10 @@ import torch -from torch import nn -from torch import autograd -from torch.optim import SGD, Adam, RMSprop, Optimizer +from rpc_test_utils import RpcTestModel, parse_args, rpc_run +from torch import autograd, nn +from torch.optim import SGD, Adam, Optimizer, RMSprop -from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine +from colossalai.legacy.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine from colossalai.testing import assert_close -from rpc_test_utils import rpc_run, parse_args, RpcTestModel # global variable for model created feat_num = 100 diff --git a/tests/test_pipeline/test_cuda_rpc_pipeline.py b/tests/test_legacy/test_pipeline/test_cuda_rpc_pipeline.py similarity index 87% rename from tests/test_pipeline/test_cuda_rpc_pipeline.py rename to tests/test_legacy/test_pipeline/test_cuda_rpc_pipeline.py index 8d03e7981..1a6077f8d 100644 --- a/tests/test_pipeline/test_cuda_rpc_pipeline.py +++ b/tests/test_legacy/test_pipeline/test_cuda_rpc_pipeline.py @@ -1,8 +1,8 @@ import torch +from rpc_test_utils import RpcTestModel, parse_args, rpc_run from torch import nn -from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine -from rpc_test_utils import rpc_run, parse_args, RpcTestModel +from colossalai.legacy.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine # global variable for model created feat_num = 100 diff --git a/tests/test_pipeline/test_cuda_rpc_value_correctness.py b/tests/test_legacy/test_pipeline/test_cuda_rpc_value_correctness.py similarity index 91% rename from tests/test_pipeline/test_cuda_rpc_value_correctness.py rename to tests/test_legacy/test_pipeline/test_cuda_rpc_value_correctness.py index e6713478b..43966ce3d 100644 --- a/tests/test_pipeline/test_cuda_rpc_value_correctness.py +++ b/tests/test_legacy/test_pipeline/test_cuda_rpc_value_correctness.py @@ -1,10 +1,9 @@ import torch -from torch import nn -from torch import autograd +from rpc_test_utils import RpcTestModel, parse_args, rpc_run +from torch import autograd, nn -from colossalai.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine +from colossalai.legacy.pipeline.rpc._pipeline_schedule import FillDrainPipelineEngine, OneFOneBPipelineEngine from colossalai.testing import assert_close -from rpc_test_utils import rpc_run, parse_args, RpcTestModel feat_num = 100 h = 100 diff --git a/tests/test_pipeline/test_middleware_1f1b.py b/tests/test_legacy/test_pipeline/test_middleware_1f1b.py similarity index 94% rename from tests/test_pipeline/test_middleware_1f1b.py rename to tests/test_legacy/test_pipeline/test_middleware_1f1b.py index 5b3aad703..4e43d52f8 100644 --- a/tests/test_pipeline/test_middleware_1f1b.py +++ 
b/tests/test_legacy/test_pipeline/test_middleware_1f1b.py @@ -7,13 +7,13 @@ import torch.distributed.rpc as rpc from rpc_test_utils import DAG_MLP, MLP from torch._C._distributed_rpc import _is_current_rpc_agent_set -from colossalai import launch from colossalai.fx import ColoTracer from colossalai.fx.passes.adding_split_node_pass import balanced_split_pass, split_with_split_nodes_pass +from colossalai.legacy import launch +from colossalai.legacy.pipeline.middleware.adaptor import get_fx_topology +from colossalai.legacy.pipeline.pipeline_process_group import ppg +from colossalai.legacy.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine from colossalai.logging import disable_existing_loggers -from colossalai.pipeline.middleware.adaptor import get_fx_topology -from colossalai.pipeline.pipeline_process_group import ppg -from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn # global variable for model created diff --git a/tests/test_pipeline/test_pipelinable.py b/tests/test_legacy/test_pipeline/test_pipelinable.py similarity index 96% rename from tests/test_pipeline/test_pipelinable.py rename to tests/test_legacy/test_pipeline/test_pipelinable.py index bb016596b..2ba5d0aa2 100644 --- a/tests/test_pipeline/test_pipelinable.py +++ b/tests/test_legacy/test_pipeline/test_pipelinable.py @@ -1,7 +1,7 @@ import pytest import torch -from colossalai.pipeline.pipelinable import PipelinableContext +from colossalai.legacy.pipeline.pipelinable import PipelinableContext from colossalai.testing import rerun_if_address_is_in_use, rerun_on_exception, spawn NUM_CHUNKS = 1 diff --git a/tests/test_pipeline/test_pipeline_process_group.py b/tests/test_legacy/test_pipeline/test_pipeline_process_group.py similarity index 91% rename from tests/test_pipeline/test_pipeline_process_group.py rename to tests/test_legacy/test_pipeline/test_pipeline_process_group.py index 2a00e3ac5..e6b956602 100644 --- a/tests/test_pipeline/test_pipeline_process_group.py +++ b/tests/test_legacy/test_pipeline/test_pipeline_process_group.py @@ -3,9 +3,9 @@ import os import torch.distributed.rpc as rpc from rpc_test_utils import pg_parse_args, rpc_is_initialized -from colossalai.initialize import launch +from colossalai.legacy.initialize import launch +from colossalai.legacy.pipeline.pipeline_process_group import ppg from colossalai.logging import disable_existing_loggers -from colossalai.pipeline.pipeline_process_group import ppg from colossalai.testing import spawn diff --git a/tests/test_tensor/common_utils/__init__.py b/tests/test_legacy/test_tensor/common_utils/__init__.py similarity index 95% rename from tests/test_tensor/common_utils/__init__.py rename to tests/test_legacy/test_tensor/common_utils/__init__.py index 5387db704..9a35d02ce 100644 --- a/tests/test_tensor/common_utils/__init__.py +++ b/tests/test_legacy/test_tensor/common_utils/__init__.py @@ -1 +1 @@ -from ._utils import * +from ._utils import * diff --git a/tests/test_tensor/common_utils/_utils.py b/tests/test_legacy/test_tensor/common_utils/_utils.py similarity index 93% rename from tests/test_tensor/common_utils/_utils.py rename to tests/test_legacy/test_tensor/common_utils/_utils.py index b405f8cd2..b6fea28e4 100644 --- a/tests/test_tensor/common_utils/_utils.py +++ b/tests/test_legacy/test_tensor/common_utils/_utils.py @@ -6,9 +6,9 @@ import torch import torch.distributed as dist from torch.testing import assert_close -from colossalai.context import 
ParallelMode -from colossalai.core import global_context as gpc -from colossalai.tensor import ComputePattern, ComputeSpec, ShardSpec +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.tensor import ComputePattern, ComputeSpec, ShardSpec def set_seed(seed): diff --git a/tests/test_tensor/core/test_dist_spec_mgr.py b/tests/test_legacy/test_tensor/core/test_dist_spec_mgr.py similarity index 91% rename from tests/test_tensor/core/test_dist_spec_mgr.py rename to tests/test_legacy/test_tensor/core/test_dist_spec_mgr.py index 89476a35b..b6d6bcee6 100644 --- a/tests/test_tensor/core/test_dist_spec_mgr.py +++ b/tests/test_legacy/test_tensor/core/test_dist_spec_mgr.py @@ -5,7 +5,7 @@ import torch import torch.distributed as dist import colossalai -from colossalai.tensor import DistSpecManager, ProcessGroup, ReplicaSpec, ShardSpec +from colossalai.legacy.tensor import DistSpecManager, ProcessGroup, ReplicaSpec, ShardSpec from colossalai.testing import rerun_if_address_is_in_use, spawn @@ -48,7 +48,7 @@ def check_mem(): def run_dist(rank, world_size, port): - colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') check_mem() run() diff --git a/tests/test_tensor/test_parameter.py b/tests/test_legacy/test_tensor/test_parameter.py similarity index 82% rename from tests/test_tensor/test_parameter.py rename to tests/test_legacy/test_tensor/test_parameter.py index 9c3f05da1..7a8694ff6 100644 --- a/tests/test_tensor/test_parameter.py +++ b/tests/test_legacy/test_tensor/test_parameter.py @@ -3,13 +3,13 @@ import torch from common_utils import tensor_equal import colossalai -from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ProcessGroup +from colossalai.tensor import ColoParameter, ColoTensor from colossalai.testing import free_port @pytest.mark.skip def test_multiinheritance(): - colossalai.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl') + colossalai.legacy.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl') colo_param = ColoParameter(None, requires_grad=True) assert colo_param.dist_spec.placement.value == 'r' assert isinstance(colo_param, ColoTensor) diff --git a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py index 5fb678525..84652093a 100644 --- a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py +++ b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py @@ -5,9 +5,6 @@ import pytest import torch import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch from colossalai.legacy.communication import ( recv_backward, recv_forward, @@ -18,6 +15,9 @@ from colossalai.legacy.communication import ( send_forward_recv_backward, send_obj_meta, ) +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch from colossalai.logging import get_dist_logger from colossalai.testing import rerun_if_address_is_in_use, spawn from colossalai.utils import get_current_device diff --git a/tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py 
b/tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py index 6d7bf6b3d..fd94c279b 100644 --- a/tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py +++ b/tests/test_legacy/test_trainer/test_pipeline/test_pipeline_schedule.py @@ -11,11 +11,11 @@ from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 import colossalai -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch +from colossalai.legacy.utils import get_dataloader, print_rank_0 from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils import get_dataloader, print_rank_0 BATCH_SIZE = 8 @@ -63,7 +63,7 @@ def run_schedule(rank, world_size, port): optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0) # initialize - engine, train_dataloader, _, _ = colossalai.initialize(model, optimizer, criterion, train_dataloader) + engine, train_dataloader, _, _ = colossalai.legacy.initialize(model, optimizer, criterion, train_dataloader) # build pipeline schedule schedule = engine.schedule diff --git a/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py b/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py index dab0e53a4..4a2405334 100644 --- a/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py +++ b/tests/test_legacy/test_trainer/test_trainer_with_non_pipe_schedule.py @@ -2,7 +2,7 @@ import pytest import torch import colossalai -from colossalai.amp.amp_type import AMP_TYPE +from colossalai.legacy.amp.amp_type import AMP_TYPE from colossalai.legacy.trainer import Trainer from colossalai.logging import get_dist_logger from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn @@ -22,10 +22,10 @@ def run_trainer(model_name): model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func() model = model_builder() optimizer = optimizer_class(model.parameters(), lr=1e-3) - engine, train_dataloader, *_ = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) + engine, train_dataloader, *_ = colossalai.legacy.initialize(model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader) logger = get_dist_logger() logger.info("engine is built", ranks=[0]) @@ -45,7 +45,12 @@ def run_trainer(model_name): def run_dist(rank, world_size, port): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') @pytest.mark.dist diff --git a/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py b/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py index 7dfbec854..521b2f32f 100644 --- a/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py +++ b/tests/test_legacy/test_trainer/test_trainer_with_pipe_schedule.py @@ -10,12 +10,13 @@ from torchvision.datasets import CIFAR10 from torchvision.models import resnet18 import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc +from colossalai.legacy.context.parallel_mode import ParallelMode +from 
colossalai.legacy.core import global_context as gpc from colossalai.legacy.trainer import Trainer +from colossalai.legacy.utils import get_dataloader from colossalai.logging import get_dist_logger from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils import MultiTimer, get_dataloader +from colossalai.utils import MultiTimer BATCH_SIZE = 4 IMG_SIZE = 32 @@ -28,7 +29,12 @@ CONFIG = dict( def run_trainer_with_pipeline(rank, world_size, port): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=port, + backend='nccl') # build model model = resnet18(num_classes=10) @@ -63,10 +69,10 @@ def run_trainer_with_pipeline(rank, world_size, port): optimizer = Adam(model.parameters(), lr=0.001) criterion = nn.CrossEntropyLoss() - engine, train_dataloader, *args = colossalai.initialize(model=model, - optimizer=optimizer, - criterion=criterion, - train_dataloader=train_dataloader) + engine, train_dataloader, *args = colossalai.legacy.initialize(model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader) logger = get_dist_logger() logger.info("engine is built", ranks=[0]) diff --git a/tests/test_utils/test_activation_checkpointing.py b/tests/test_legacy/test_utils/test_activation_checkpointing.py similarity index 94% rename from tests/test_utils/test_activation_checkpointing.py rename to tests/test_legacy/test_utils/test_activation_checkpointing.py index b7764c2f4..19984ae12 100644 --- a/tests/test_utils/test_activation_checkpointing.py +++ b/tests/test_legacy/test_utils/test_activation_checkpointing.py @@ -5,10 +5,10 @@ import pytest import torch import torch.nn.functional as F -from colossalai.context.parallel_mode import ParallelMode -from colossalai.context.random import add_seed, reset_seeds, seed, set_mode +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.context.random import add_seed, reset_seeds, seed, set_mode +from colossalai.legacy.utils.activation_checkpoint import checkpoint from colossalai.testing import clear_cache_before_run, parameterize -from colossalai.utils.activation_checkpoint import checkpoint def forward(x, weight): diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_1d.py similarity index 83% rename from tests/test_utils/test_checkpoint/test_checkpoint_1d.py rename to tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_1d.py index 9c3a7e216..88cd89a21 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py +++ b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_1d.py @@ -8,17 +8,17 @@ import torch import torch.nn as nn import colossalai.legacy.nn as col_nn -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch +from colossalai.legacy.utils import is_using_pp +from colossalai.legacy.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn -from 
colossalai.utils import is_using_pp -from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint def build_pipeline(model): - from colossalai.pipeline.utils import partition_uniform + from colossalai.legacy.pipeline.utils import partition_uniform pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2d.py similarity index 83% rename from tests/test_utils/test_checkpoint/test_checkpoint_2d.py rename to tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2d.py index 03b2e4f2a..591cd714f 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py +++ b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2d.py @@ -8,17 +8,17 @@ import torch import torch.nn as nn import colossalai.legacy.nn as col_nn -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch +from colossalai.legacy.utils import is_using_pp +from colossalai.legacy.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn -from colossalai.utils import is_using_pp -from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint def build_pipeline(model): - from colossalai.pipeline.utils import partition_uniform + from colossalai.legacy.pipeline.utils import partition_uniform pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2p5d.py similarity index 84% rename from tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py rename to tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2p5d.py index cafffd0a6..b165b4276 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py +++ b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_2p5d.py @@ -8,17 +8,17 @@ import torch import torch.nn as nn import colossalai.legacy.nn as col_nn -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch +from colossalai.legacy.utils import is_using_pp +from colossalai.legacy.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn -from colossalai.utils import is_using_pp -from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint def build_pipeline(model): - from colossalai.pipeline.utils import partition_uniform + from colossalai.legacy.pipeline.utils import 
partition_uniform pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_3d.py similarity index 83% rename from tests/test_utils/test_checkpoint/test_checkpoint_3d.py rename to tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_3d.py index 9b43be9e8..2ce054d33 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py +++ b/tests/test_legacy/test_utils/test_checkpoint/test_checkpoint_3d.py @@ -8,17 +8,17 @@ import torch import torch.nn as nn import colossalai.legacy.nn as col_nn -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.initialize import launch +from colossalai.legacy.context.parallel_mode import ParallelMode +from colossalai.legacy.core import global_context as gpc +from colossalai.legacy.initialize import launch +from colossalai.legacy.utils import is_using_pp +from colossalai.legacy.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint from colossalai.logging import disable_existing_loggers from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn -from colossalai.utils import is_using_pp -from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint def build_pipeline(model): - from colossalai.pipeline.utils import partition_uniform + from colossalai.legacy.pipeline.utils import partition_uniform pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE) pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE) diff --git a/tests/test_utils/test_memory.py b/tests/test_legacy/test_utils/test_memory.py similarity index 76% rename from tests/test_utils/test_memory.py rename to tests/test_legacy/test_utils/test_memory.py index c88c2f8ec..2e25dc773 100644 --- a/tests/test_utils/test_memory.py +++ b/tests/test_legacy/test_utils/test_memory.py @@ -1,9 +1,9 @@ import pytest import colossalai +from colossalai.legacy.utils.memory import colo_device_memory_capacity, colo_set_process_memory_fraction from colossalai.testing import spawn from colossalai.utils.cuda import get_current_device -from colossalai.utils.memory import colo_device_memory_capacity, colo_set_process_memory_fraction def _run_colo_set_process_memory_fraction_and_colo_device_memory_capacity(): @@ -14,7 +14,7 @@ def _run_colo_set_process_memory_fraction_and_colo_device_memory_capacity(): def run_dist(rank, world_size, port): - colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') _run_colo_set_process_memory_fraction_and_colo_device_memory_capacity() diff --git a/tests/test_utils/test_norm_gradient_clipping.py b/tests/test_legacy/test_utils/test_norm_gradient_clipping.py similarity index 91% rename from tests/test_utils/test_norm_gradient_clipping.py rename to tests/test_legacy/test_utils/test_norm_gradient_clipping.py index 4fd7c3c60..918f174ab 100644 --- a/tests/test_utils/test_norm_gradient_clipping.py +++ b/tests/test_legacy/test_utils/test_norm_gradient_clipping.py @@ -4,12 +4,12 @@ from torch.nn.parameter import Parameter from torch.nn.utils import clip_grad_norm_ import colossalai +from colossalai.legacy.tensor import ColoTensorSpec, 
ProcessGroup, distspec +from colossalai.legacy.utils.common import clip_grad_norm from colossalai.logging import disable_existing_loggers -from colossalai.tensor import ColoTensorSpec, ProcessGroup, distspec from colossalai.tensor.colo_parameter import ColoParameter from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn from colossalai.utils import get_current_device -from colossalai.utils.common import clip_grad_norm def close(num: float, other: float, rtol: float = 1e-5, atol: float = 1e-8): @@ -62,7 +62,7 @@ def run_grad_clip_norm(world_size: int, dtype: torch.dtype, device: str, norm_ty def run_dist(rank, world_size, port): disable_existing_loggers() - colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') run_grad_clip_norm(world_size=world_size) diff --git a/tests/test_utils/test_commons.py b/tests/test_legacy/test_zero/test_commons.py similarity index 82% rename from tests/test_utils/test_commons.py rename to tests/test_legacy/test_zero/test_commons.py index 2633d7da2..42a9f1eec 100644 --- a/tests/test_utils/test_commons.py +++ b/tests/test_legacy/test_zero/test_commons.py @@ -1,13 +1,13 @@ import torch import colossalai +from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline +from colossalai.legacy.zero.sharded_param import ShardedTensor from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move, colo_model_data_tensor_move_inline -from colossalai.zero.legacy.sharded_param import ShardedTensor def run_tensor_move(rank, world_size, port): - colossalai.launch(config={}, rank=0, world_size=world_size, host='localhost', port=port, backend='nccl') + colossalai.legacy.launch(config={}, rank=0, world_size=world_size, host='localhost', port=port, backend='nccl') src_t = torch.ones(2, 3).cuda() tgt_t = torch.zeros(2, 3) diff --git a/tests/test_moe/test_kernel.py b/tests/test_moe/test_kernel.py index 39603c158..c096b6075 100644 --- a/tests/test_moe/test_kernel.py +++ b/tests/test_moe/test_kernel.py @@ -3,9 +3,9 @@ import torch import torch.nn as nn import colossalai -from colossalai.context import ParallelMode from colossalai.context.moe_context import MOE_CONTEXT -from colossalai.core import global_context as gpc +from colossalai.legacy.context import ParallelMode +from colossalai.legacy.core import global_context as gpc from colossalai.nn.layer.moe import Experts, MoeLayer, Top1Router, Top2Router from colossalai.testing import rerun_if_address_is_in_use, spawn from colossalai.utils import get_current_device diff --git a/tests/test_moe/test_moe_zero_optim.py b/tests/test_moe/test_moe_zero_optim.py index a43ae764d..35fde6f10 100644 --- a/tests/test_moe/test_moe_zero_optim.py +++ b/tests/test_moe/test_moe_zero_optim.py @@ -2,8 +2,8 @@ import pytest import torch import colossalai -from colossalai.amp import convert_to_apex_amp from colossalai.context import MOE_CONTEXT +from colossalai.legacy.amp import convert_to_apex_amp from colossalai.legacy.engine.gradient_handler import MoeGradientHandler from colossalai.nn import MoeLoss from colossalai.nn.optimizer import CPUAdam diff --git a/tests/test_tensor/test_comm_spec_apply.py b/tests/test_tensor/test_comm_spec_apply.py index 2c68633aa..4a3199c1c 100644 --- a/tests/test_tensor/test_comm_spec_apply.py +++ 
b/tests/test_tensor/test_comm_spec_apply.py @@ -1,7 +1,7 @@ import pytest import torch +import torch.distributed as dist -from colossalai.core import global_context as gpc from colossalai.device.device_mesh import DeviceMesh from colossalai.initialize import launch from colossalai.logging import disable_existing_loggers @@ -184,7 +184,7 @@ def check_comm(rank, world_size, port): launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') physical_mesh_id = torch.arange(0, 4) - assert rank == gpc.get_global_rank() + assert rank == dist.get_rank() mesh_shape = (2, 2) # [[0, 1, @@ -205,7 +205,6 @@ def check_comm(rank, world_size, port): # test all reduce in 1D flatten device mesh check_all_reduce_in_flatten_device_mesh(device_mesh, rank) - gpc.destroy() @pytest.mark.dist diff --git a/tests/test_tensor/test_dtensor/test_comm_spec.py b/tests/test_tensor/test_dtensor/test_comm_spec.py index 95fcd2aaf..a1ea2946e 100644 --- a/tests/test_tensor/test_dtensor/test_comm_spec.py +++ b/tests/test_tensor/test_dtensor/test_comm_spec.py @@ -1,7 +1,7 @@ import pytest import torch +import torch.distributed as dist -from colossalai.core import global_context as gpc from colossalai.device.device_mesh import DeviceMesh from colossalai.initialize import launch from colossalai.logging import disable_existing_loggers @@ -127,7 +127,7 @@ def check_comm(rank, world_size, port): launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') physical_mesh_id = torch.arange(0, 4) - assert rank == gpc.get_global_rank() + assert rank == dist.get_rank() mesh_shape = (2, 2) # [[0, 1, @@ -149,8 +149,6 @@ def check_comm(rank, world_size, port): check_all_reduce_fwd(process_group_dict, rank) check_all_reduce_bwd(process_group_dict, rank) - gpc.destroy() - @pytest.mark.dist @rerun_if_address_is_in_use() diff --git a/tests/test_tensor/test_mix_gather.py b/tests/test_tensor/test_mix_gather.py index 9122808eb..bd71bffcc 100644 --- a/tests/test_tensor/test_mix_gather.py +++ b/tests/test_tensor/test_mix_gather.py @@ -1,7 +1,7 @@ import pytest import torch +import torch.distributed as dist -from colossalai.core import global_context as gpc from colossalai.device.device_mesh import DeviceMesh from colossalai.initialize import launch from colossalai.logging import disable_existing_loggers @@ -295,7 +295,7 @@ def check_comm(rank, world_size, port): launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') physical_mesh_id = torch.arange(0, 8) - assert rank == gpc.get_global_rank() + assert rank == dist.get_rank() mesh_shape = (2, 4) # [[0, 1, 2, 3], diff --git a/tests/test_utils/test_zero_gradient_clippling.py b/tests/test_utils/test_zero_gradient_clippling.py deleted file mode 100644 index e99cf388e..000000000 --- a/tests/test_utils/test_zero_gradient_clippling.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from functools import partial - -import pytest -import torch -import torch.distributed as dist -import torch.nn as nn -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.nn.utils import clip_grad_norm_ - -import colossalai -from colossalai.logging import disable_existing_loggers -from colossalai.testing import rerun_if_address_is_in_use, spawn -from colossalai.utils import checkpoint, clip_grad_norm_fp32 -from colossalai.zero.legacy.shard_utils.tensor_shard_strategy import TensorShardStrategy -from colossalai.zero.legacy.sharded_model.sharded_model_v2 import 
ShardedModelV2
-
-
-def checkpoint_wrapper(module, enable=True):
-    if enable:
-        module.forward = partial(checkpoint, module.forward, False)
-    return module
-
-
-class Net(nn.Module):
-
-    def __init__(self, checkpoint=False) -> None:
-        super().__init__()
-        self.fc1 = nn.Linear(5, 5)
-        self.fc2 = nn.Linear(5, 5)
-        self.fc3 = nn.Linear(5, 1)
-        if checkpoint:
-            self.fc1 = checkpoint_wrapper(self.fc1)
-        self.layers = [self.fc1, self.fc2, self.fc1, self.fc2, self.fc3]
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-def run_step(model, optimizer, x, enable_autocast=False, norm_type=2.0):
-    model.train()
-    optimizer.zero_grad()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
-        y = model(x)
-        loss = y.sum()
-    loss = loss.float()
-    loss.backward()
-    clip_grad(model, norm_type)
-    optimizer.step()
-
-
-def clip_grad(model, norm_type):
-    if isinstance(model, DDP):
-        clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=norm_type)
-    else:
-        clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=norm_type)
-
-
-def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
-    if loose:
-        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
-    return torch.allclose(tensor_a, tensor_b)
-
-
-def check_grads(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_grad = zero_p.grad.clone().to(p.device)
-        chunks = torch.flatten(p.grad).chunk(4)
-        if rank >= len(chunks):
-            continue
-        grad = chunks[rank]
-        if zero_p.zero_shard_padding > 0:
-            zero_grad = zero_grad[:-zero_p.zero_shard_padding]
-        assert grad.dtype == zero_grad.dtype
-        assert allclose(grad, zero_grad, loose=loose)
-
-
-def check_params(model, zero_model, loose=False):
-    rank = dist.get_rank()
-    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
-        zero_shard_padding = zero_p.zero_shard_padding
-        zero_p = zero_p.clone().to(p.device)
-        chunks = torch.flatten(p).chunk(4)
-        if rank >= len(chunks):
-            continue
-        p = chunks[rank]
-        if zero_shard_padding > 0:
-            zero_p = zero_p[:-zero_shard_padding]
-        assert p.dtype == zero_p.dtype
-        assert allclose(p, zero_p, loose=loose)
-
-
-def run_dist(rank, world_size, port):
-    disable_existing_loggers()
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-
-
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_zero_clip_grad():
-    world_size = 4
-    spawn(run_dist, world_size)
-
-
-if __name__ == '__main__':
-    test_zero_clip_grad()
diff --git a/tests/test_zero/test_gemini/test_chunk_mgrv2.py b/tests/test_zero/test_gemini/test_chunk_mgrv2.py
index d6c4f8bd8..f05ccfdbd 100644
--- a/tests/test_zero/test_gemini/test_chunk_mgrv2.py
+++ b/tests/test_zero/test_gemini/test_chunk_mgrv2.py
@@ -6,7 +6,6 @@ import colossalai
 from colossalai.tensor import ColoTensor
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from colossalai.zero.gemini.chunk import ChunkManager
-from tests.test_tensor.common_utils import debug_print
 
 CUDA_MEM_0 = {False: 512, True: 1024}
 CUDA_MEM_1 = {False: 0, True: 1024}
@@ -16,7 +15,6 @@ CPU_MEM = {True: {True: 0, False: 0}, False: {True: 512, False: 0}}
 
 @parameterize('keep_gathered', [True, False])
 @parameterize('pin_memory', [True, False])
 def exam_chunk_memory(keep_gathered, pin_memory):
-    debug_print([0], "keep_gathered: {}, pin_memory: {}".format(keep_gathered, pin_memory))
     params = [ColoTensor(torch.rand(8, 8)) for _ in range(3)]
     config = {2: dict(chunk_size=128, keep_gathered=keep_gathered)}
diff --git a/tests/test_zero/test_gemini/test_fwd_bwd.py b/tests/test_zero/test_gemini/test_fwd_bwd.py
index 4cbf564ec..fabdd6072 100644
--- a/tests/test_zero/test_gemini/test_fwd_bwd.py
+++ b/tests/test_zero/test_gemini/test_fwd_bwd.py
@@ -5,15 +5,15 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.amp import convert_to_apex_amp
+from colossalai.legacy.amp import convert_to_apex_amp
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.utils.cuda import get_current_device
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test import run_fwd_bwd
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_gemini/test_gemini_use_rmt.py b/tests/test_zero/test_gemini/test_gemini_use_rmt.py
index a80a2f62d..614a96ccd 100644
--- a/tests/test_zero/test_gemini/test_gemini_use_rmt.py
+++ b/tests/test_zero/test_gemini/test_gemini_use_rmt.py
@@ -4,12 +4,12 @@ import torch.distributed as dist
 
 import colossalai
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.zero import GeminiDDP
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from colossalai.zero.gemini.memory_tracer.runtime_mem_tracer import RuntimeMemTracer
 from tests.components_to_test import run_fwd_bwd
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 
 # run gemini use the runtime memory tracer
diff --git a/tests/test_zero/test_gemini/test_grad_clip.py b/tests/test_zero/test_gemini/test_grad_clip.py
index 82b9133b8..860d6efa8 100644
--- a/tests/test_zero/test_gemini/test_grad_clip.py
+++ b/tests/test_zero/test_gemini/test_grad_clip.py
@@ -5,14 +5,14 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.amp import convert_to_apex_amp
+from colossalai.legacy.amp import convert_to_apex_amp
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test import run_fwd_bwd
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_gemini/test_inference.py b/tests/test_zero/test_gemini/test_inference.py
index 20d145f96..99ee08c1d 100644
--- a/tests/test_zero/test_gemini/test_inference.py
+++ b/tests/test_zero/test_gemini/test_inference.py
@@ -7,15 +7,15 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.amp import convert_to_apex_amp
+from colossalai.legacy.amp import convert_to_apex_amp
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.utils.cuda import get_current_device
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test import run_fwd_bwd
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_gemini/test_optim.py b/tests/test_zero/test_gemini/test_optim.py
index edcbada0a..345495919 100644
--- a/tests/test_zero/test_gemini/test_optim.py
+++ b/tests/test_zero/test_gemini/test_optim.py
@@ -5,15 +5,15 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.amp import convert_to_apex_amp
+from colossalai.legacy.amp import convert_to_apex_amp
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.utils.cuda import get_current_device
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test import run_fwd_bwd
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
index 656bd709e..602e3ad35 100644
--- a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
+++ b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
@@ -4,10 +4,10 @@ from torch.testing import assert_close
 
 import colossalai
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.zero import GeminiDDP
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_gemini/test_zerooptim_state_dict.py b/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
index 09725e11e..5f7b51510 100644
--- a/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
+++ b/tests/test_zero/test_gemini/test_zerooptim_state_dict.py
@@ -5,10 +5,10 @@ import torch.distributed as dist
 
 import colossalai
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import set_seed
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.chunk import search_chunk_configuration
 from tests.components_to_test.registry import non_distributed_component_funcs
-from tests.test_tensor.common_utils import set_seed
 
 PLACEMENT_CONFIGS = [
     {
diff --git a/tests/test_zero/test_low_level/test_zero_tp.py b/tests/test_zero/test_low_level/test_zero_tp.py
deleted file mode 100644
index 4a2b49f63..000000000
--- a/tests/test_zero/test_low_level/test_zero_tp.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import pytest
-import torch
-import torch.nn as nn
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing import assert_close
-
-import colossalai
-from colossalai.tensor import ProcessGroup
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.utils import get_current_device
-from colossalai.zero import ColoInitContext, LowLevelZeroOptimizer
-from tests.test_tensor.common_utils import set_seed, split_param_col_tp1d, split_param_row_tp1d, tensor_shard_equal
-
-
-def strict_shard_equal(tensor, shard, tp_pg, rtol=1e-3, atol=1e-4):
-    return tensor_shard_equal(tensor, shard, tp_pg.tp_local_rank(), tp_pg.tp_world_size(), rtol, atol)
-
-
-class MlpModel(nn.Module):
-
-    def __init__(self):
-        super(MlpModel, self).__init__()
-        self.linear1 = nn.Linear(32, 128)
-        self.act = nn.GELU()
-        self.linear2 = nn.Linear(128, 32)
-
-    def forward(self, x):
-        y = self.linear1(x)
-        y = self.act(y)
-        y = self.linear2(y)
-        return x + y
-
-
-@parameterize("overlap_flag", [False, True])
-@parameterize("partition_flag", [False, True])
-def exam_zero_with_tp(overlap_flag, partition_flag):
-    set_seed(233010)
-    tp_pg = ProcessGroup(tp_degree=2)
-
-    with ColoInitContext(device=get_current_device(), default_pg=tp_pg):
-        hybrid_model = MlpModel()
-    torch_model = MlpModel().cuda()
-    for pt, ph in zip(torch_model.parameters(), hybrid_model.parameters()):
-        pt.data.copy_(ph.data)
-
-    for name, param in hybrid_model.named_parameters():
-        if 'linear1' in name:
-            split_param_row_tp1d(param, tp_pg)
-            param.compute_spec.set_output_replicate(False)
-        if 'linear2.weight' in name:
-            split_param_col_tp1d(param, tp_pg)
-
-    torch_model = DDP(torch_model, device_ids=[tp_pg.rank()], process_group=tp_pg.dp_process_group())
-    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-2)    # set to 1e-2 for torch-1.11
-    hybrid_optim = torch.optim.Adam(hybrid_model.parameters(), lr=1e-2)
-    hybrid_optim = LowLevelZeroOptimizer(hybrid_optim,
-                                         initial_scale=2,
-                                         clip_grad_norm=1.0,
-                                         overlap_communication=overlap_flag,
-                                         partition_grad=partition_flag,
-                                         dp_process_group=tp_pg.dp_process_group(),
-                                         tp_process_group=tp_pg.tp_process_group())
-
-    dp_local_rank = tp_pg.dp_local_rank()
-    set_seed(255 + dp_local_rank)
-
-    data = torch.randn(8, 32, device=get_current_device())
-    torch_loss = torch_model(data).sum()
-    hybrid_loss = hybrid_model(data).sum()
-    assert_close(torch_loss, hybrid_loss)
-
-    torch_loss.backward()
-    torch.nn.utils.clip_grad_norm_(torch_model.parameters(), 1.0)
-    hybrid_optim.backward(hybrid_loss)
-
-    torch_optim.step()
-    hybrid_optim.step()
-
-    for (name, pt), ph in zip(torch_model.named_parameters(), hybrid_model.parameters()):
-        assert strict_shard_equal(pt.data, ph.data, tp_pg)
-
-
-def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    exam_zero_with_tp()
-
-
-@pytest.mark.skip('this will be rewritten by shardformer')
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_zero_with_tp():
-    spawn(run_dist, 4)
-
-
-if __name__ == '__main__':
-    test_zero_with_tp()