[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Hongxin Liu 1 year ago committed by GitHub
parent 32e7f99416
commit b5f9e37c70

@ -89,7 +89,7 @@ jobs:
- name: Install ColossalAI
run: |
source activate pytorch
pip install -v .
CUDA_EXT=1 pip install -v .
- name: Test the Doc
run: |

@ -32,7 +32,7 @@ jobs:
- name: Install ColossalAI
run: |
pip install -v .
CUDA_EXT=1 pip install -v .
- name: Install Doc Test Requirements
run: |

@ -53,7 +53,7 @@ jobs:
uses: actions/checkout@v3
- name: Install Colossal-AI
run: |
pip install -v .
CUDA_EXT=1 pip install -v .
- name: Test the example
run: |
dir=${{ matrix.directory }}

@ -88,7 +88,7 @@ jobs:
- name: Install Colossal-AI
run: |
pip install -v .
CUDA_EXT=1 pip install -v .
- name: Test the example
run: |

@ -42,7 +42,7 @@ jobs:
- name: Install Colossal-AI
run: |
pip install -v .
CUDA_EXT=1 pip install -v .
- name: Traverse all files
run: |

@ -1,11 +1,4 @@
from .initialize import (
get_default_parser,
initialize,
launch,
launch_from_openmpi,
launch_from_slurm,
launch_from_torch,
)
from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch
try:
# .version will be created by setup.py
@ -15,3 +8,5 @@ except ModuleNotFoundError:
# and directly set PYTHONPATH to use Colossal-AI which is a bad practice
__version__ = '0.0.0'
print('please install Colossal-AI from https://www.colossalai.org/download or from source')
__all__ = ['launch', 'launch_from_openmpi', 'launch_from_slurm', 'launch_from_torch', '__version__']
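
With the engine-era helpers gone, the package root now re-exports only the launch functions. As a minimal sketch (assuming a torchrun-managed environment, and that the deprecated `config` argument is still accepted at this point):

import colossalai

# rank, world size and master address are read from the environment set by torchrun;
# `config` is kept only for backward compatibility and is slated for removal.
colossalai.launch_from_torch(config={})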

@ -1,54 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.context import Config
from .amp_type import AMP_TYPE
from .apex_amp import convert_to_apex_amp
from .naive_amp import convert_to_naive_amp
from .torch_amp import convert_to_torch_amp
__all__ = ['convert_to_amp', 'convert_to_naive_amp', 'convert_to_apex_amp', 'convert_to_torch_amp', 'AMP_TYPE']
def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules.
Args:
model (:class:`torch.nn.Module`): your model object.
optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
mode (:class:`colossalai.amp.AMP_TYPE`): amp mode.
amp_config (Union[:class:`colossalai.context.Config`, dict]): configuration for different amp modes.
Returns:
A tuple (model, optimizer, criterion).
Note:
``amp_config`` may vary depending on the mode you choose. You should check the corresponding amp mode
for more details about ``amp_config``.
For ``apex_amp``, please check
`apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
For ``naive_amp``, please check
`naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
For ``torch_amp``, please check
`torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
if amp_config is None:
amp_config = Config()
if mode == AMP_TYPE.TORCH:
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)
elif mode == AMP_TYPE.APEX:
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)
elif mode == AMP_TYPE.NAIVE:
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)
return model, optimizer, criterion

@ -1,60 +0,0 @@
import inspect
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.utils import is_no_pp_or_last_stage
from ._fp16_optimizer import FP16Optimizer
from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer
def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
"""A helper function to wrap training components with naive AMP modules. In this mode,
we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
which is equivalent to Apex O3.
Args:
model (:class:`torch.nn.Module`): your model object
optimizer (:class:`torch.optim.Optimizer`): your optimizer object
amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.
Returns:
Tuple: A tuple (model, optimizer)
The ``amp_config`` should contain parameters below::
verbose (bool, optional): if set to `True`, will print debug info (Default: False).
clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0).
Note that clipping is ignored if clip_grad == 0.
dynamic_grad_scale (bool): whether to use dynamic grad scaler.
"""
if isinstance(model, nn.ModuleList):
# interleaved pipeline
module_list = []
for chunk, m in enumerate(model):
output_to_fp32 = is_no_pp_or_last_stage() and chunk == len(model) - 1
module_list.append(NaiveAMPModel(m, output_to_fp32=output_to_fp32))
model = nn.ModuleList(module_list)
else:
output_to_fp32 = is_no_pp_or_last_stage()
model = NaiveAMPModel(model, output_to_fp32=output_to_fp32)
use_dynamic_grad_scaler = amp_config.pop('dynamic_grad_scale', True)
if use_dynamic_grad_scaler:
scaler_class = DynamicGradScaler
else:
scaler_class = ConstantGradScaler
sig = inspect.signature(scaler_class.__init__)
kwargs = dict()
for param in sig.parameters.values():
if param.name in amp_config:
kwargs[param.name] = amp_config.pop(param.name)
grad_scaler = scaler_class(**kwargs)
optimizer = NaiveAMPOptimizer(optimizer, grad_scaler, **amp_config)
return model, optimizer
__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']

@ -5,8 +5,8 @@ import torch
from torch.optim import Optimizer
from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.interface import OptimizerWrapper
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import get_current_device
from .base_offload_module import BaseOffloadModule
@ -19,7 +19,7 @@ class OptimState(Enum):
UNSCALED = 1
class AMPOptimizer(ColossalaiOptimizer):
class AMPOptimizer(OptimizerWrapper):
"""
A wrapper for Optimizer.
Code reference: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/optimizer/zero_optimizer.py
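
For context, a rough sketch of the new base class in isolation, assuming OptimizerWrapper from colossalai.interface simply delegates to the wrapped torch optimizer (subclasses such as AMPOptimizer then override backward/step for loss scaling):

import torch
from colossalai.interface import OptimizerWrapper

model = torch.nn.Linear(8, 8)
optim = OptimizerWrapper(torch.optim.SGD(model.parameters(), lr=0.1))

loss = model(torch.randn(4, 8)).sum()
optim.backward(loss)   # assumed to delegate to loss.backward()
optim.step()
optim.zero_grad()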

@ -13,7 +13,6 @@ import torch.nn as nn
from torch.optim import Optimizer
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.tensor.d_tensor import (
is_customized_distributed_tensor,
is_distributed_tensor,
@ -130,10 +129,7 @@ def unwrap_optimizer(optimizer: OptimizerWrapper):
This method should be used before saving/loading it to/from sharded checkpoints.
'''
# TODO(Baizhou): ColossalaiOptimizer will be replaced with OptimizerWrapper in the future
unwrapped_optim = optimizer.optim
if isinstance(unwrapped_optim, ColossalaiOptimizer):
unwrapped_optim = unwrapped_optim.optim
return unwrapped_optim

@ -1,28 +0,0 @@
import click
from colossalai.context import Config
from .benchmark import run_benchmark
from .utils import *
__all__ = ['benchmark']
@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
"--model",
type=click.Choice(['mlp'], case_sensitive=False),
default='mlp',
help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
layers: int, model: str):
args_dict = locals()
args = Config(args_dict)
run_benchmark(args)

@ -1,105 +0,0 @@
from functools import partial
from typing import Dict, List
import click
import torch.multiprocessing as mp
import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer
from .models import MLP
def run_benchmark(args: Config) -> None:
"""
Run benchmarking with torch.multiprocessing.
"""
# sanity checks
if args.gpus is None:
click.echo("Error: --num_gpus is not given")
exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===")
for k, v in args.items():
click.echo(f'{k}: {v}')
click.echo('')
config_list = find_all_configs(args.gpus)
avail_ports = [free_port() for _ in range(len(config_list))]
run_func = partial(run_dist_profiling,
world_size=args.gpus,
port_list=avail_ports,
config_list=config_list,
hyperparams=args)
mp.spawn(run_func, nprocs=args.gpus)
def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
hyperparams: Config) -> None:
"""
A function executed for profiling; it should be spawned by torch.multiprocessing.
Args:
rank (int): rank of the process
world_size (int): the number of processes
port_list (List[int]): a list of free ports for initializing distributed networks
config_list (List[Dict]): a list of configuration
hyperparams (Config): the hyperparameters given by the user
"""
# disable logging for clean output
disable_existing_loggers()
logger = get_dist_logger()
logger.set_level('WARNING')
for config, port in zip(config_list, port_list):
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
)
continue
if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else:
if gpc.get_global_rank() == 0:
click.echo("Error: Invalid argument for --model")
exit()
data_func = partial(get_batch_data,
dim=hyperparams.dimension,
batch_size=hyperparams.batch_size,
seq_length=hyperparams.seq_len,
mode=config.parallel.tensor.mode)
fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
warmup_steps=hyperparams.warmup_steps,
profile_steps=hyperparams.profile_steps,
data_func=data_func,
timer=timer)
gpc.destroy()
reset_seeds()
if gpc.get_global_rank() == 0:
config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
click.echo(f"=== {config_str} ===")
click.echo(f"Average forward time: {fwd_time}")
click.echo(f"Average backward time: {bwd_time}")
click.echo(f"Max allocated GPU memory: {max_allocated}")
click.echo(f"Max cached GPU memory: {max_cached}\n")

@ -1,18 +0,0 @@
import torch
import colossalai.legacy.nn as col_nn
class MLP(torch.nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(col_nn.Linear(dim, dim))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x

@ -1,159 +0,0 @@
import math
import time
from typing import Callable, Dict, List, Tuple
import torch
from colossalai.context import Config, ParallelMode
from colossalai.utils import MultiTimer
def get_time_stamp() -> int:
"""
Return the time stamp for profiling.
Returns:
time_stamp (int): the time given by time.time()
"""
torch.cuda.synchronize()
time_stamp = time.time()
return time_stamp
def get_memory_states() -> Tuple[float]:
"""
Return the memory statistics.
Returns:
max_allocated (float): the allocated CUDA memory
max_cached (float): the cached CUDA memory
"""
max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
max_cached = torch.cuda.max_memory_reserved() / (1024**3)
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
return max_allocated, max_cached
def find_all_configs(device_cnt: int) -> List[Dict]:
"""
Find all possible configurations for tensor parallelism
Args:
device_cnt (int): the number of devices
Returns:
config_list (List[Dict]): a list of configurations
"""
def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num
def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num
config_list = []
# add non-parallel config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
config_list.append(config)
# add 1D config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config)
# add 2D config only if device_cnt is a square
if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config)
# check for 2.5D
# iterate over depth
for depth in range(1, device_cnt):
if device_cnt % depth == 0 and _is_square(device_cnt // depth):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
config_list.append(config)
# check for 3D if device_cnt is a cube
if _is_cube(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
config_list.append(config)
config_list = [Config(cfg) for cfg in config_list]
return config_list
def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
timer: MultiTimer) -> Tuple[float]:
"""
Profile the forward and backward of a model
Args:
model (torch.nn.Module): a PyTorch model
warmup_steps (int): the number of steps for warmup
profile_steps (int): the number of steps for profiling
data_func (Callable): a function to generate random data
timer (colossalai.utils.MultiTimer): a timer instance for time recording
Returns:
fwd_time (float): the average time taken by the forward pass, in seconds
bwd_time (float): the average time taken by the backward pass, in seconds
max_allocated (float): the maximum GPU memory allocated in GB
max_cached (float): the maximum GPU memory cached in GB
"""
def _run_step(data):
timer.start('forward')
out = model(data)
timer.stop('forward', keep_in_history=True)
timer.start('backward')
out.mean().backward()
timer.stop('backward', keep_in_history=True)
data_list = [data_func() for _ in range(warmup_steps)]
for data in data_list:
_run_step(data)
timer.reset('forward')
timer.reset('backward')
for _ in range(profile_steps):
data = data_func()
_run_step(data)
max_allocated, max_cached = get_memory_states()
fwd_time = timer.get_timer('forward').get_history_mean()
bwd_time = timer.get_timer('backward').get_history_mean()
return fwd_time, bwd_time, max_allocated, max_cached
def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
"""
Return a random data of shape (batch_size, seq_length, dim) for profiling.
Args:
dim (int): hidden size
batch_size (int): the number of data samples
seq_length (int): the number of tokens
mode (ParallelMode): Colossal-AI ParallelMode enum
Returns:
data (torch.Tensor): random data
"""
if mode in ['2d', '2.5d']:
batch_size = batch_size // 2
dim = dim // 2
elif mode == '3d':
batch_size = batch_size // 4
dim = dim // 2
data = torch.rand(batch_size, seq_length, dim).cuda()
return data

@ -1,6 +1,5 @@
import click
from .benchmark import benchmark
from .check import check
from .launcher import run
@ -19,7 +18,6 @@ def cli():
cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)
if __name__ == '__main__':
cli()

@ -1,6 +1,8 @@
from .config import Config, ConfigException
from .parallel_context import ParallelContext
from .parallel_mode import ParallelMode
from .moe_context import MOE_CONTEXT
from .process_group_initializer import *
from .random import *
# from .moe_context import MOE_CONTEXT
__all__ = [
'Config',
'ConfigException',
]

@ -3,13 +3,12 @@ from typing import Tuple
import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.tensor import ProcessGroup
from colossalai.legacy.tensor import ProcessGroup
def _check_sanity():
from colossalai.core import global_context as gpc
from colossalai.legacy.core import global_context as gpc
if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
raise NotImplementedError("Moe is not compatible with tensor or "
"pipeline parallel at present.")
@ -61,7 +60,7 @@ class MoeContext(metaclass=SingletonMeta):
self.world_size = dist.get_world_size()
from colossalai.core import global_context as gpc
from colossalai.legacy.core import global_context as gpc
self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
assert self.world_size % self.max_ep_size == 0, \
"Maximum expert parallel size must be a factor of the number of GPUs"

@ -1,6 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from colossalai.context.parallel_context import global_context
__all__ = ['global_context']

@ -1,9 +1,11 @@
import operator
import torch
import torch.nn as nn
import operator
from colossalai.tensor import ProcessGroup
from colossalai.tensor.distspec import ShardSpec
from colossalai.tensor.compute_spec import ComputePattern, ComputeSpec
from colossalai.legacy.tensor import ProcessGroup
from colossalai.legacy.tensor.compute_spec import ComputePattern, ComputeSpec
from colossalai.legacy.tensor.distspec import ShardSpec
ELEMENTWISE_MODULE_OP = [torch.nn.Dropout, torch.nn.ReLU]
ELEMENTWISE_FUNC_OP = [
@ -13,7 +15,7 @@ ELEMENTWISE_FUNC_OP = [
def weight_split(weight: torch.nn.parameter.Parameter, dim: int, col_normal: bool) -> torch.nn.parameter.Parameter:
"""weight_split
"""weight_split
split a nn.Parameter
Args:
@ -60,9 +62,9 @@ def row_shard_linear_pass(gm: torch.fx.GraphModule):
def transformer_mlp_pass(graph_module: torch.fx.GraphModule, process_group: ProcessGroup):
"""
This IR pass checks for transformer MLP like structure and annotate column and row sharding to the linear layers.
This IR pass checks for transformer MLP like structure and annotate column and row sharding to the linear layers.
"""
#TODO: Needs to handle special cases, like x = linear(x) + linear(x)
# TODO: Needs to handle special cases, like x = linear(x) + linear(x)
graph = graph_module.graph
world_size = process_group.world_size()

@ -1,58 +1,17 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import os
import pprint
import warnings
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
from typing import Dict, Union
import torch
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
import torch.distributed as dist
from colossalai.amp import AMP_TYPE, convert_to_amp
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.context import Config, ConfigException, ParallelMode
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.core import global_context as gpc
from colossalai.legacy.builder.builder import build_gradient_handler
from colossalai.legacy.engine import Engine
from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
from colossalai.legacy.engine.schedule import (
InterleavedPipelineSchedule,
NonPipelineSchedule,
PipelineSchedule,
get_tensor_shape,
)
from colossalai.context import Config
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer
from colossalai.utils import get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
from colossalai.utils.moe import sync_moe_model_param
from colossalai.zero.legacy import ShardedOptimizerV2, convert_to_zero_v2
from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file')
parser.add_argument('--host', type=str, help='the master address for distributed training')
parser.add_argument('--port', type=int, help='the master port for distributed training')
parser.add_argument('--world_size', type=int, help='world size for distributed training')
parser.add_argument('--rank', type=int, help='rank for the default process group')
parser.add_argument('--local_rank', type=int, help='local rank on the node')
parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication')
return parser
from colossalai.utils import set_device, set_seed
def launch(config: Union[str, Path, Config, Dict],
@ -83,40 +42,23 @@ def launch(config: Union[str, Path, Config, Dict],
Raises:
Exception: Raise exception when config type is wrong
"""
gpc.verbose = verbose
# set config
assert isinstance(config, (Config, str, Path, dict)), \
f'expected argument config to be Config, str or Path, but got {type(config)}'
if not isinstance(config, Config) and isinstance(config, dict):
config = Config(config)
if isinstance(config, (str, Path)):
config = Config.from_file(config)
gpc.load_config(config)
if rank == 0:
warnings.warn("`config` is deprecated and will be removed soon.")
# init default process group
gpc.init_global_dist(rank, world_size, backend, host, port)
# init process groups for different parallel modes from config
gpc.init_parallel_groups()
init_method = f'tcp://[{host}]:{port}'
dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)
# set cuda device
if torch.cuda.is_available():
# if local rank is not given, calculate automatically
gpc.set_device(local_rank)
# set the number of processes running on the same node
gpc.detect_num_processes_on_current_node()
set_device(local_rank)
gpc.set_seed(seed)
set_seed(seed)
if verbose:
logger = get_dist_logger()
logger.info(
f'Distributed environment is initialized, '
f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, '
f'tensor parallel size: {gpc.tensor_parallel_size}',
ranks=[0])
logger.info(f'Distributed environment is initialized, world size: {dist.get_world_size()}', ranks=[0])
def launch_from_slurm(config: Union[str, Path, Config, Dict],
@ -224,247 +166,3 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
backend=backend,
seed=seed,
verbose=verbose)
def initialize(model: nn.Module,
optimizer: Optimizer,
criterion: Optional[_Loss] = None,
train_dataloader: Optional[Iterable] = None,
test_dataloader: Optional[Iterable] = None,
lr_scheduler: Optional[_LRScheduler] = None,
ophooks: Optional[List[BaseOpHook]] = None,
verbose: bool = True) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
"""Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config.
Args:
model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
Your optimizer instance.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
verbose (bool, optional): Whether to print logs.
Returns:
Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
where only ``engine`` could not be None.
"""
# get logger
logger = get_dist_logger()
gpc.verbose = verbose
# get config from gpc
config = gpc.config
# print config
if verbose:
logger.info(
f"\n========== Your Config ========\n"
f"{pprint.pformat(gpc.config)}\n"
f"================================\n",
ranks=[0])
# cudnn
cudnn_benchmark = config.get('cudnn_benchmark', False)
cudnn_deterministic = config.get('cudnn_deterministic', False)
torch.backends.cudnn.benchmark = cudnn_benchmark
torch.backends.cudnn.deterministic = cudnn_deterministic
if verbose:
logger.info(f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0])
# zero
use_zero = hasattr(gpc.config, 'zero')
if use_zero:
zero_cfg = gpc.config.get('zero', None)
if zero_cfg is not None:
cfg_ = zero_cfg.copy()
else:
cfg_ = {}
optimizer_config = zero_cfg.get('optimizer_config', None)
model_config = zero_cfg.get('model_config', None)
model, optimizer = convert_to_zero_v2(model,
optimizer,
model_config=model_config,
optimizer_config=optimizer_config)
logger.info("Initializing ZeRO model and optimizer finished!", ranks=[0])
else:
if isinstance(model, nn.Module):
# first sync model across dp ranks
model.to(get_current_device())
elif isinstance(model, Callable):
model = model().to(get_current_device())
# optimizer maybe a optimizer_cls
if isinstance(optimizer, Callable):
optimizer = optimizer(model.parameters())
logger.warning("Initializing an non ZeRO model with optimizer class")
if not use_zero:
if is_using_sequence():
sync_model_param(model, ParallelMode.SEQUENCE_DP)
elif MOE_CONTEXT.is_initialized:
sync_moe_model_param(model)
elif is_using_ddp():
sync_model_param(model, ParallelMode.DATA)
else:
logger.warning(
"The parameters of models is not automatically synchronized.\n"
"Please make sure that all parameters are the same in data parallel group.",
ranks=[0])
# check amp and zero
fp16_cfg = gpc.config.get('fp16', None)
if fp16_cfg is not None and fp16_cfg.mode is not None and use_zero:
raise ConfigException(
"It is not allowed to set fp16 and zero configuration in your config file at the same time")
# clip grad norm
clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0)
# initialize amp
amp_mode = None
if fp16_cfg is not None and fp16_cfg.mode is not None:
cfg_ = fp16_cfg.copy()
amp_mode = cfg_.pop('mode')
if is_using_pp():
assert amp_mode == AMP_TYPE.NAIVE, 'Pipeline only support NaiveAMP currently'
if amp_mode == AMP_TYPE.NAIVE:
cfg_['clip_grad_norm'] = clip_grad_norm
model, optimizer, criterion = convert_to_amp(model=model,
optimizer=optimizer,
criterion=criterion,
mode=amp_mode,
amp_config=cfg_)
# get torch ddp config
torch_ddp_cfg = gpc.config.get('torch_ddp', dict())
# gradient handler
gradient_handler_cfg = gpc.config.get('gradient_handler', None)
if gradient_handler_cfg is None:
# if gradient handler is not specified in the configuration file,
# check in the following order
# 1. if optimizer is ZERO, then use zero grad handler
# 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp
# 3. if using pipeline and dp size larger than 1, use data parallel grad handler
if isinstance(optimizer, ShardedOptimizerV2):
gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
if verbose:
logger.info(
"Training with zero is detected, ZeROGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_ddp() and MOE_CONTEXT.is_initialized:
gradient_handler_cfg = [dict(type='MoeGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected with moe parallel, MoeGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_sequence():
model = DDP(model,
process_group=gpc.get_group(ParallelMode.SEQUENCE_DP),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism',
ranks=[0])
elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE:
model = DDP(model,
process_group=gpc.get_group(ParallelMode.DATA),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Data Parallelism', ranks=[0])
elif is_using_ddp():
gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected when using pipeline parallel, "
"DataParallelGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
# add pipeline parallel gradient handler, if pipeline shared module is detected
for param in model.parameters():
if getattr(param, 'pipeline_shared_module_pg', None) is not None:
if gradient_handler_cfg is None:
gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')]
else:
gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler'))
if verbose:
logger.info(
"pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
break
else:
if not isinstance(gradient_handler_cfg, list):
raise ConfigException(
f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}"
)
# turn off sync buffer for NaiveAMPModel if using torch DDP and NaiveAMPModel at the same time
# to avoid duplicated buffer synchronization
if isinstance(model, DDP) and isinstance(model.module, NaiveAMPModel):
model.module.sync_buffer = False
# initialize schedule for engine
if is_using_pp():
tensor_shape = get_tensor_shape()
use_interleaved = hasattr(gpc.config, 'model') and hasattr(gpc.config.model, 'num_chunks')
if gpc.is_initialized(ParallelMode.PARALLEL_1D):
scatter_gather = True
else:
scatter_gather = False
if use_interleaved:
if isinstance(model, nn.Sequential):
model = nn.ModuleList([model])
schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
gpc.config.model.num_chunks,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = NonPipelineSchedule()
if gradient_handler_cfg is None:
gradient_handlers = None
if verbose and not isinstance(model, DDP):
logger.warning(
"No PyTorch DDP or gradient handler is set up, please make sure you do not need "
"to all-reduce the gradients after a training step.",
ranks=[0])
else:
gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg]
# check if optimizer is ColossalaiOptimizer
if not isinstance(optimizer, (ColossalaiOptimizer, ShardedOptimizerV2)):
optimizer = ColossalaiOptimizer(optim=optimizer)
# gradient accumulation
grad_accum_size = gpc.config.get('gradient_accumulation', None)
if grad_accum_size is not None:
optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
accumulate_size=grad_accum_size,
gradient_handlers=gradient_handlers,
lr_scheduler=lr_scheduler)
engine = Engine(model=model,
optimizer=optimizer,
criterion=criterion,
gradient_handlers=gradient_handlers,
clip_grad_norm=clip_grad_norm,
ophook_list=ophooks,
schedule=schedule)
return engine, train_dataloader, test_dataloader, lr_scheduler
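
After this refactor, launch only initializes the default process group, sets the device and seeds the RNGs; the `config` argument merely triggers a deprecation warning. A single-process sketch under those assumptions (host/port values are illustrative):

import colossalai

colossalai.launch(
    config={},          # deprecated, kept so existing scripts keep running
    rank=0,
    world_size=1,
    host='127.0.0.1',
    port=29500,
    backend='nccl',     # use 'gloo' on machines without CUDA
)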

@ -0,0 +1,9 @@
from .initialize import initialize, launch, launch_from_openmpi, launch_from_slurm, launch_from_torch
__all__ = [
'launch',
'launch_from_openmpi',
'launch_from_slurm',
'launch_from_torch',
'initialize',
]
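
The engine-based entry point survives under colossalai.legacy. A hedged sketch of the relocated call, assuming colossalai.legacy.launch_from_torch (or launch) has already loaded a config into gpc; the model/optimizer below are placeholders:

import torch
import colossalai.legacy as legacy

model = torch.nn.Linear(16, 16)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

# dataloaders and lr_scheduler stay optional, mirroring the removed docstring above
engine, train_dataloader, test_dataloader, lr_scheduler = legacy.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
)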

@ -0,0 +1,54 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.context import Config
from .amp_type import AMP_TYPE
from .apex_amp import convert_to_apex_amp
from .naive_amp import convert_to_naive_amp
from .torch_amp import convert_to_torch_amp
__all__ = ['convert_to_amp', 'convert_to_naive_amp', 'convert_to_apex_amp', 'convert_to_torch_amp', 'AMP_TYPE']
def convert_to_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules.
Args:
model (:class:`torch.nn.Module`): your model object.
optimizer (:class:`torch.optim.Optimizer`): your optimizer object.
criterion (:class:`torch.nn.modules.loss._Loss`): your loss function object.
mode (:class:`colossalai.legacy.amp.AMP_TYPE`): amp mode.
amp_config (Union[:class:`colossalai.context.Config`, dict]): configuration for different amp modes.
Returns:
A tuple (model, optimizer, criterion).
Note:
``amp_config`` may vary depending on the mode you choose. You should check the corresponding amp mode
for more details about ``amp_config``.
For ``apex_amp``, please check
`apex_amp config <https://nvidia.github.io/apex/amp.html?highlight=apex%20amp>`_.
For ``naive_amp``, please check
`naive_amp config <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/amp/naive_amp/_fp16_optimizer.py#L42>`_.
For ``torch_amp``, please check
`torch_amp config <https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py#L97>`_.
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
if amp_config is None:
amp_config = Config()
if mode == AMP_TYPE.TORCH:
model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config)
elif mode == AMP_TYPE.APEX:
model, optimizer = convert_to_apex_amp(model, optimizer, amp_config)
elif mode == AMP_TYPE.NAIVE:
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)
return model, optimizer, criterion
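
A usage sketch for the relocated helper; the import path and the GradScaler keyword below are assumptions based on the docstring above, not part of this diff:

import torch
from colossalai.legacy.amp import AMP_TYPE, convert_to_amp

model = torch.nn.Linear(32, 32).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

model, optimizer, criterion = convert_to_amp(
    model, optimizer, criterion,
    mode=AMP_TYPE.TORCH,
    amp_config=dict(init_scale=2.**16),   # forwarded to torch.cuda.amp.GradScaler
)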

@ -10,11 +10,11 @@ except ImportError:
from torch import Tensor
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import clip_grad_norm_fp32
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.utils import clip_grad_norm_fp32
class ApexAMPOptimizer(ColossalaiOptimizer):
class ApexAMPOptimizer(OptimizerWrapper):
""" A wrapper class for APEX optimizer and it implements apex-specific backward and clip_grad_norm
methods
"""

@ -0,0 +1,60 @@
import inspect
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.amp.naive_amp.grad_scaler import ConstantGradScaler, DynamicGradScaler
from colossalai.legacy.utils import is_no_pp_or_last_stage
from ._fp16_optimizer import FP16Optimizer
from .naive_amp import NaiveAMPModel, NaiveAMPOptimizer
def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
"""A helper function to wrap training components with naive AMP modules. In this mode,
we forcibly cast the model weights and inputs to FP16, and cast the model outputs to FP32 to calculate loss,
which is equivalent to Apex O3.
Args:
model (:class:`torch.nn.Module`): your model object
optimizer (:class:`torch.optim.Optimizer`): your optimizer object
amp_config (:class:`colossalai.context.Config` or dict): configuration for naive mode amp.
Returns:
Tuple: A tuple (model, optimizer)
The ``amp_config`` should contain parameters below::
verbose (bool, optional): if set to `True`, will print debug info (Default: False).
clip_grad_norm (float, optional): clip gradients with this global L2 norm (Default 0).
Note that clipping is ignored if clip_grad == 0.
dynamic_grad_scale (bool): whether to use dynamic grad scaler.
"""
if isinstance(model, nn.ModuleList):
# interleaved pipeline
module_list = []
for chunk, m in enumerate(model):
output_to_fp32 = is_no_pp_or_last_stage() and chunk == len(model) - 1
module_list.append(NaiveAMPModel(m, output_to_fp32=output_to_fp32))
model = nn.ModuleList(module_list)
else:
output_to_fp32 = is_no_pp_or_last_stage()
model = NaiveAMPModel(model, output_to_fp32=output_to_fp32)
use_dynamic_grad_scaler = amp_config.pop('dynamic_grad_scale', True)
if use_dynamic_grad_scaler:
scaler_class = DynamicGradScaler
else:
scaler_class = ConstantGradScaler
sig = inspect.signature(scaler_class.__init__)
kwargs = dict()
for param in sig.parameters.values():
if param.name in amp_config:
kwargs[param.name] = amp_config.pop(param.name)
grad_scaler = scaler_class(**kwargs)
optimizer = NaiveAMPOptimizer(optimizer, grad_scaler, **amp_config)
return model, optimizer
__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer', 'FP16Optimizer']
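
And a sketch for the naive mode, assuming the module now lives at colossalai.legacy.amp.naive_amp, that the legacy global context has been launched, and that the scaler keyword matches the DynamicGradScaler signature referenced above:

import torch
from colossalai.legacy.amp.naive_amp import convert_to_naive_amp

model = torch.nn.Linear(64, 64).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# clip_grad_norm / dynamic_grad_scale come from the docstring; initial_scale is popped
# into the grad scaler via the inspect.signature matching shown above
amp_config = dict(clip_grad_norm=1.0, dynamic_grad_scale=True, initial_scale=2.**15)
model, optimizer = convert_to_naive_amp(model, optimizer, amp_config)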

@ -6,14 +6,15 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from torch.optim import Optimizer
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.amp.naive_amp.grad_scaler import BaseGradScaler
from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes
from colossalai.logging import get_dist_logger
from colossalai.utils import clip_grad_norm_fp32, copy_tensor_parallel_attributes, multi_tensor_applier
from colossalai.utils import multi_tensor_applier
from ._utils import has_inf_or_nan, zero_gard_by_list
from .grad_scaler import BaseGradScaler
try:
from colossalai._C import fused_optim

@ -11,14 +11,14 @@ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.distributed import ReduceOp
from torch.optim import Optimizer
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from ._fp16_optimizer import FP16Optimizer
class NaiveAMPOptimizer(ColossalaiOptimizer):
class NaiveAMPOptimizer(OptimizerWrapper):
"""A wrapper class for optimizer to cast all parameters to fp16
Args:
@ -57,7 +57,7 @@ class NaiveAMPModel(nn.Module):
Args:
model (torch.nn.Module): torch.nn.Module to be wrapped.
output_to_fp32 (bool, optional): Whether cast output of this module into fp32. (Default: True)
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this module.
(Default: ``ParallelMode.DATA``)
sync_buffer (bool, optional): whether to synchronize buffer. (Default: True)

@ -13,8 +13,8 @@ import torch.distributed as dist
from packaging import version
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
class _MultiDeviceReplicator(object):

@ -7,13 +7,13 @@ from torch import Tensor
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import clip_grad_norm_fp32
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.utils import clip_grad_norm_fp32
from ._grad_scaler import GradScaler
class TorchAMPOptimizer(ColossalaiOptimizer):
class TorchAMPOptimizer(OptimizerWrapper):
"""A wrapper class which integrate Pytorch AMP with an optimizer
Args:

@ -6,8 +6,8 @@ import torch.distributed as dist
from torch import Tensor
from torch.distributed import ReduceOp
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
_all_gather_func = dist._all_gather_base \
if "all_gather_into_tensor" not in dir(dist) else dist.all_gather_into_tensor
@ -26,7 +26,7 @@ def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op:
Args:
tensor (:class:`torch.Tensor`): Tensor to be gathered.
dim (int): The dimension concatenating in.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
@ -65,7 +65,7 @@ def reduce_scatter(tensor: Tensor,
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
dim (int): The dimension concatenating in.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
@ -105,7 +105,7 @@ def all_reduce(tensor: Tensor,
Args:
tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
@ -141,7 +141,7 @@ def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: b
Args:
tensor (:class:`torch.Tensor`): Tensor to be broadcast.
src (int): Source rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
@ -173,7 +173,7 @@ def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp =
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduced.
dst (int): Destination rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:

@ -8,8 +8,8 @@ from typing import List, Tuple, Union
import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.utils import get_current_device
from .utils import gather_split_1d_tensor, split_tensor_into_1d_equal_chunks

@ -10,8 +10,8 @@ import torch.distributed as dist
from torch.distributed import ProcessGroupNCCL
from torch.distributed import distributed_c10d as c10d
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
TensorShape = Union[torch.Size, List[int], Tuple[int]]
_pg_manager = {}

@ -3,8 +3,8 @@
import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.utils import get_current_device, synchronize

@ -3,8 +3,8 @@ from typing import List, Tuple, Union
import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.utils import get_current_device
TensorShape = Union[torch.Size, List[int], Tuple[int]]
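
The communication helpers touched above keep their signatures; only the ParallelMode/gpc imports move under colossalai.legacy. An illustrative call, assuming the module itself is now importable as colossalai.legacy.communication and that a legacy launch has created the groups:

import torch
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.context import ParallelMode

t = torch.ones(4, device='cuda')
t = all_reduce(t, ParallelMode.DATA)   # sums across the data-parallel group by default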

@ -0,0 +1,4 @@
from .parallel_context import ParallelContext
from .parallel_mode import ParallelMode
from .process_group_initializer import *
from .random import *

@ -11,10 +11,10 @@ import numpy as np
import torch
import torch.distributed as dist
from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING
from colossalai.context.config import Config
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.constants import ALLOWED_MODES, INITIALIZER_MAPPING
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import DIST_GROUP_INITIALIZER
from colossalai.logging import get_dist_logger
@ -110,12 +110,12 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds the global rank of the current device for `parallel_mode` to the context.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._global_ranks[parallel_mode] = rank
@ -124,11 +124,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the local rank of the current device.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
int: The local rank of the current device for `parallel_mode`.
@ -140,12 +140,12 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds the local rank of the current device for `parallel_mode` to the context.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode for the rank.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode for the rank.
rank (int): The rank to be added.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._local_ranks[parallel_mode] = rank
@ -154,11 +154,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the global rank of the next device.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
int: The global rank of the next device for `parallel_mode`.
@ -176,11 +176,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the global rank of the previous device.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
int: The global rank of the previous device for `parallel_mode`.
@ -199,11 +199,11 @@ class ParallelContext(metaclass=SingletonMeta):
among its group for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
bool: a boolean value indicating whether the current device is the first one
@ -217,11 +217,11 @@ class ParallelContext(metaclass=SingletonMeta):
among its group for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
bool: a boolean value indicating whether the current device is the first one
@ -248,11 +248,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the world size for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
int: The world size for `parallel_mode`.
@ -264,12 +264,12 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds world size for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode corresponding to the process group
world_size (int): The world size to be added
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._world_sizes[parallel_mode] = world_size
@ -278,11 +278,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the group of the current device for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
torch.distributed.ProcessGroup: The group of the current device for `parallel_mode`.
@ -294,12 +294,12 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds the group of the current device for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
group (torch.distributed.ProcessGroup): The group to be added
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._groups[parallel_mode] = group
@ -308,9 +308,9 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the Gloo group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:type parallel_mode: :class:`colossalai.legacy.context.ParallelMode`
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
of :class:`colossalai.legacy.context.ParallelMode`
:return: The group of the current device for `parallel_mode`
:rtype: torch.distributed.ProcessGroup
"""
@ -321,11 +321,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds the Gloo group of the current device for `parallel_mode`.
:param parallel_mode: The chosen parallel mode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:type parallel_mode: :class:`colossalai.legacy.context.ParallelMode`
:param group: The group to be added
:type group: torch.distributed.ProcessGroup
:raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`
of :class:`colossalai.legacy.context.ParallelMode`
"""
self._check_parallel_mode(parallel_mode)
self._cpu_groups[parallel_mode] = group
@ -334,11 +334,11 @@ class ParallelContext(metaclass=SingletonMeta):
"""Returns the rank of the current device for `parallel_mode` in the group.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
Returns:
int: The rank of the current device for `parallel_mode` in the group.
@ -350,12 +350,12 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds the ranks of the current device for `parallel_mode` in the group.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
ranks (list): List of ranks to be added
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
of :class:`colossalai.context.ParallelMode`.
of :class:`colossalai.legacy.context.ParallelMode`.
"""
self._check_parallel_mode(parallel_mode)
self._ranks_in_group[parallel_mode] = ranks
@ -489,7 +489,7 @@ class ParallelContext(metaclass=SingletonMeta):
in the current system.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Returns:
bool: a boolean value indicating whether `parallel_mode` is initialized in the current system.
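
The docstring churn above is just the colossalai.context -> colossalai.legacy.context rename; the gpc API itself is unchanged. Typical usage with the updated paths (assuming a legacy launch has initialized the groups):

from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc

dp_rank = gpc.get_local_rank(ParallelMode.DATA)
dp_size = gpc.get_world_size(ParallelMode.DATA)
dp_group = gpc.get_group(ParallelMode.DATA)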

@ -3,10 +3,10 @@ from .initializer_2d import Initializer_2D
from .initializer_2p5d import Initializer_2p5D
from .initializer_3d import Initializer_3D
from .initializer_data import Initializer_Data
from .initializer_model import Initializer_Model
from .initializer_pipeline import Initializer_Pipeline
from .initializer_sequence import Initializer_Sequence
from .initializer_tensor import Initializer_Tensor
from .initializer_model import Initializer_Model
from .process_group_initializer import ProcessGroupInitializer
__all__ = [

@ -3,7 +3,7 @@
import torch.distributed as dist
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import DIST_GROUP_INITIALIZER
from ..parallel_mode import ParallelMode

@ -2,7 +2,7 @@ import math
import torch.distributed as dist
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import DIST_GROUP_INITIALIZER
from ..parallel_mode import ParallelMode

@ -6,7 +6,7 @@ import math
import torch.distributed as dist
from colossalai.context import Config
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import DIST_GROUP_INITIALIZER
from ..parallel_mode import ParallelMode

@ -5,7 +5,7 @@ import math
import torch.distributed as dist
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import DIST_GROUP_INITIALIZER
from ..parallel_mode import ParallelMode

@ -7,8 +7,8 @@ from contextlib import contextmanager
import torch.cuda
from torch import Tensor
from .seed_manager import SeedManager
from ..parallel_mode import ParallelMode
from .seed_manager import SeedManager
_SEED_MANAGER = SeedManager()
@ -53,11 +53,11 @@ def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
:class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added.
:class:`colossalai.legacy.context.ParallelMode` or the seed for `parallel_mode` has been added.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -70,7 +70,7 @@ def set_mode(parallel_mode: ParallelMode):
"""Sets the current mode of the seed manager.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -83,7 +83,7 @@ def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
"""Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.
Raises:
@ -161,7 +161,7 @@ def with_seed(func, parallel_mode: ParallelMode):
def moe_set_seed(seed):
if torch.cuda.is_available():
from colossalai.core import global_context as gpc
from colossalai.legacy.core import global_context as gpc
global_rank = gpc.get_global_rank()
diff_seed = seed + global_rank
add_seed(ParallelMode.TENSOR, diff_seed, True)

@ -4,7 +4,7 @@
import torch
from torch import Tensor
from colossalai.context.parallel_mode import ParallelMode
from colossalai.legacy.context.parallel_mode import ParallelMode
class SeedManager:
@ -36,7 +36,7 @@ class SeedManager:
"""Sets the state of the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
state (:class:`torch.Tensor`): the state to be set.
Raises:
@ -49,7 +49,7 @@ class SeedManager:
"""Sets the current mode of the seed manager.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
"""
if self.current_mode:
# save the current state for current mode
@ -63,12 +63,12 @@ class SeedManager:
"""Adds a seed to the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
overwrite (bool, optional): Whether allows to overwrite the seed that has been set already
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode`
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.legacy.context.ParallelMode`
or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
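A minimal usage sketch of the relocated seed helpers shown in this hunk, assuming they are still re-exported from colossalai.legacy.context.random (the exact re-export path is an assumption) and that CUDA is available, since the seed manager tracks CUDA RNG states:

from colossalai.legacy.context import ParallelMode
from colossalai.legacy.context.random import add_seed, set_mode

add_seed(ParallelMode.DATA, 1024)    # register a CUDA RNG state for the data-parallel mode
add_seed(ParallelMode.TENSOR, 1042)  # a different seed for tensor-parallel regions (e.g. dropout)
set_mode(ParallelMode.DATA)          # make the data-parallel RNG state the current one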

@ -0,0 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from colossalai.legacy.context.parallel_context import global_context
__all__ = ['global_context']
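With this new module, the global context is imported from the legacy namespace. A minimal sketch of the new import path (assuming launch() has already initialized the process groups):

from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc

rank = gpc.get_global_rank()               # global rank of the current process
if gpc.is_initialized(ParallelMode.DATA):  # query a parallel group created during launch
    data_group = gpc.get_group(ParallelMode.DATA)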

@ -8,6 +8,7 @@ from torch import Tensor
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
from colossalai.legacy.engine.schedule import (
BaseSchedule,
@ -15,9 +16,8 @@ from colossalai.legacy.engine.schedule import (
NonPipelineSchedule,
PipelineSchedule,
)
from colossalai.legacy.zero.gemini import BaseOpHook, register_ophooks_recursively
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
class Engine:
@ -27,7 +27,7 @@ class Engine:
Args:
model (``torch.nn.Module``): The neural network model.
optimizer (``colossalai.nn.optimizer.ColossalaiOptimizer``): Optimizer for updating the parameters.
optimizer (``colossalai.interface.OptimizerWrapper``): Optimizer for updating the parameters.
criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
clip_grad_norm (float, optional): The norm of gradient clipping.
@ -61,7 +61,7 @@ class Engine:
def __init__(self,
model: Module,
optimizer: "ColossalaiOptimizer",
optimizer: "OptimizerWrapper",
criterion: Optional[_Loss] = None,
gradient_handlers: Optional[List[BaseGradientHandler]] = None,
clip_grad_norm: float = 0.0,
@ -157,7 +157,7 @@ class Engine:
"""Execute parameter update
"""
self._all_reduce_gradients()
self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
self.optimizer.clip_grad_by_norm(self._clip_grad_norm)
return self.optimizer.step()
def backward(self, loss: Tensor):
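The step above now goes through the OptimizerWrapper interface, which clips by norm without needing the model reference. A minimal sketch of the new calling convention (a plain torch optimizer wrapped directly; backward() on the wrapper is assumed to simply delegate to loss.backward()):

import torch
from colossalai.interface import OptimizerWrapper

model = torch.nn.Linear(8, 8)
optimizer = OptimizerWrapper(optim=torch.optim.SGD(model.parameters(), lr=0.1))

loss = model(torch.randn(4, 8)).sum()
optimizer.backward(loss)          # assumed to delegate to loss.backward()
optimizer.clip_grad_by_norm(1.0)  # replaces the old clip_grad_norm(model, max_norm)
optimizer.step()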

@ -10,12 +10,12 @@ from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.engine import BaseGradientHandler
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import conditional_context
class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumOptimizer(OptimizerWrapper):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached.
@ -74,7 +74,7 @@ class GradAccumOptimizer(ColossalaiOptimizer):
if self.accumulate_step < self.accumulate_size:
pass
else:
self.optim.clip_grad_norm(model, max_norm)
self.optim.clip_grad_by_norm(max_norm)
def backward(self, loss: Tensor) -> None:
"""Execute backward pass.

@ -1,5 +1,5 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import GRADIENT_HANDLER
from ._base_gradient_handler import BaseGradientHandler

@ -1,6 +1,6 @@
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.utils.moe import get_moe_epsize_param_dict

@ -6,7 +6,7 @@ import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from colossalai.core import global_context as gpc
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import GRADIENT_HANDLER
from ._base_gradient_handler import BaseGradientHandler

@ -1,5 +1,5 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import GRADIENT_HANDLER
from ._base_gradient_handler import BaseGradientHandler

@ -7,11 +7,11 @@ from typing import Callable, List, Tuple, Union
import torch.cuda
import colossalai.legacy.communication as comm
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.amp.naive_amp import NaiveAMPModel
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.utils import switch_virtual_pipeline_parallel_rank
from colossalai.logging import get_dist_logger
from colossalai.utils import switch_virtual_pipeline_parallel_rank
from colossalai.utils.cuda import get_current_device
from ._base_schedule import BaseSchedule
@ -157,7 +157,7 @@ class PipelineSchedule(BaseSchedule):
return self._move_to_device(micro_batch_data)
def pre_processing(self, engine):
from colossalai.zero.legacy import ShardedModelV2
from colossalai.legacy.zero import ShardedModelV2
# TODO: remove this after testing new zero with pipeline parallelism
model = engine.model

@ -6,8 +6,8 @@ from typing import Iterable, Tuple
import torch.cuda
import colossalai.legacy.communication.p2p_v2 as comm
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.engine import Engine
from colossalai.utils.cuda import get_current_device

@ -0,0 +1,472 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import os
import pprint
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn.modules.loss import _Loss
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
from colossalai.context import Config, ConfigException
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.amp import AMP_TYPE, convert_to_amp
from colossalai.legacy.amp.naive_amp import NaiveAMPModel
from colossalai.legacy.builder.builder import build_gradient_handler
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.engine import Engine
from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
from colossalai.legacy.engine.schedule import (
InterleavedPipelineSchedule,
NonPipelineSchedule,
PipelineSchedule,
get_tensor_shape,
)
from colossalai.legacy.utils import is_using_ddp, is_using_pp, is_using_sequence, sync_model_param
from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
from colossalai.utils.moe import sync_moe_model_param
def get_default_parser():
"""Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file')
parser.add_argument('--host', type=str, help='the master address for distributed training')
parser.add_argument('--port', type=int, help='the master port for distributed training')
parser.add_argument('--world_size', type=int, help='world size for distributed training')
parser.add_argument('--rank', type=int, help='rank for the default process group')
parser.add_argument('--local_rank', type=int, help='local rank on the node')
parser.add_argument('--backend', type=str, default='nccl', help='backend for distributed communication')
return parser
def launch(config: Union[str, Path, Config, Dict],
rank: int,
world_size: int,
host: str,
port: int,
backend: str = 'nccl',
local_rank: int = None,
seed: int = 1024,
verbose: bool = True):
"""This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
arguments are not given. Then initialize and set distributed environment by calling global_context's functions.
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
rank (int): Rank for the default process group
world_size (int): World size of the default process group
host (str): The master address for distributed training
port (int): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
local_rank (int, optional):
Rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically.
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
Raises:
Exception: Raise exception when config type is wrong
"""
gpc.verbose = verbose
# set config
assert isinstance(config, (Config, str, Path, dict)), \
f'expected argument config to be Config, str, Path or dict, but got {type(config)}'
if not isinstance(config, Config) and isinstance(config, dict):
config = Config(config)
if isinstance(config, (str, Path)):
config = Config.from_file(config)
gpc.load_config(config)
# init default process group
gpc.init_global_dist(rank, world_size, backend, host, port)
# init process groups for different parallel modes from config
gpc.init_parallel_groups()
# set cuda device
if torch.cuda.is_available():
# if local rank is not given, calculate automatically
gpc.set_device(local_rank)
# set the number of processes running on the same node
gpc.detect_num_processes_on_current_node()
gpc.set_seed(seed)
if verbose:
logger = get_dist_logger()
logger.info(
f'Distributed environment is initialized, '
f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, '
f'tensor parallel size: {gpc.tensor_parallel_size}',
ranks=[0])
def launch_from_slurm(config: Union[str, Path, Config, Dict],
host: str,
port: int,
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
"""A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (int): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
try:
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
except KeyError as e:
raise RuntimeError(
f"Could not find {e} in the SLURM environment, visit https://www.colossalai.org/ for more information on launching with SLURM"
)
launch(config=config,
rank=rank,
world_size=world_size,
host=host,
port=port,
backend=backend,
seed=seed,
verbose=verbose)
def launch_from_openmpi(config: Union[str, Path, Config, Dict],
host: str,
port: int,
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
"""A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
host (str): The master address for distributed training
port (int): The master port for distributed training
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
try:
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
except KeyError as e:
raise RuntimeError(
f"Could not find {e} in the OpenMPI environment, visit https://www.colossalai.org/ for more information on launching with OpenMPI"
)
launch(config=config,
local_rank=local_rank,
rank=rank,
world_size=world_size,
host=host,
port=port,
backend=backend,
seed=seed,
verbose=verbose)
def launch_from_torch(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
Args:
config (Union[str, dict, Config]): Config file or config file path are both acceptable
backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``
seed (int, optional): Specified random seed for every process. Defaults to 1024.
verbose (bool, optional): Whether to print logs. Defaults to True.
"""
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
except KeyError as e:
raise RuntimeError(
f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
)
launch(config=config,
local_rank=local_rank,
rank=rank,
world_size=world_size,
host=host,
port=port,
backend=backend,
seed=seed,
verbose=verbose)
def initialize(model: nn.Module,
optimizer: Optimizer,
criterion: Optional[_Loss] = None,
train_dataloader: Optional[Iterable] = None,
test_dataloader: Optional[Iterable] = None,
lr_scheduler: Optional[_LRScheduler] = None,
ophooks: Optional[List[BaseOpHook]] = None,
verbose: bool = True) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
"""Core function to wrap the essential training components with our functionality based on the config which is
loaded into gpc.config.
Args:
model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
Your optimizer instance.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
lr_scheduler (:class:`torch.nn.lr_scheduler._LRScheduler`, optional): Your lr scheduler instance, optional.
verbose (bool, optional): Whether to print logs.
Returns:
Tuple (engine, train_dataloader, test_dataloader, lr_scheduler):
A tuple of ``(engine, train_dataloader, test_dataloader, lr_scheduler)``
where only ``engine`` is guaranteed to be not ``None``.
"""
# get logger
logger = get_dist_logger()
gpc.verbose = verbose
# get config from gpc
config = gpc.config
# print config
if verbose:
logger.info(
f"\n========== Your Config ========\n"
f"{pprint.pformat(gpc.config)}\n"
f"================================\n",
ranks=[0])
# cudnn
cudnn_benchmark = config.get('cudnn_benchmark', False)
cudnn_deterministic = config.get('cudnn_deterministic', False)
torch.backends.cudnn.benchmark = cudnn_benchmark
torch.backends.cudnn.deterministic = cudnn_deterministic
if verbose:
logger.info(f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0])
# zero
use_zero = hasattr(gpc.config, 'zero')
if use_zero:
zero_cfg = gpc.config.get('zero', None)
if zero_cfg is not None:
cfg_ = zero_cfg.copy()
else:
cfg_ = {}
optimizer_config = zero_cfg.get('optimizer_config', None)
model_config = zero_cfg.get('model_config', None)
model, optimizer = convert_to_zero_v2(model,
optimizer,
model_config=model_config,
optimizer_config=optimizer_config)
logger.info("Initializing ZeRO model and optimizer finished!", ranks=[0])
else:
if isinstance(model, nn.Module):
# move the model to the current device (parameters are synchronized across dp ranks below)
model.to(get_current_device())
elif isinstance(model, Callable):
model = model().to(get_current_device())
# the optimizer may be an optimizer class rather than an instance
if isinstance(optimizer, Callable):
optimizer = optimizer(model.parameters())
logger.warning("Initializing an non ZeRO model with optimizer class")
if not use_zero:
if is_using_sequence():
sync_model_param(model, ParallelMode.SEQUENCE_DP)
elif MOE_CONTEXT.is_initialized:
sync_moe_model_param(model)
elif is_using_ddp():
sync_model_param(model, ParallelMode.DATA)
else:
logger.warning(
"The parameters of models is not automatically synchronized.\n"
"Please make sure that all parameters are the same in data parallel group.",
ranks=[0])
# check amp and zero
fp16_cfg = gpc.config.get('fp16', None)
if fp16_cfg is not None and fp16_cfg.mode is not None and use_zero:
raise ConfigException(
"It is not allowed to set fp16 and zero configuration in your config file at the same time")
# clip grad norm
clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0)
# initialize amp
amp_mode = None
if fp16_cfg is not None and fp16_cfg.mode is not None:
cfg_ = fp16_cfg.copy()
amp_mode = cfg_.pop('mode')
if is_using_pp():
assert amp_mode == AMP_TYPE.NAIVE, 'Pipeline parallelism only supports NaiveAMP currently'
if amp_mode == AMP_TYPE.NAIVE:
cfg_['clip_grad_norm'] = clip_grad_norm
model, optimizer, criterion = convert_to_amp(model=model,
optimizer=optimizer,
criterion=criterion,
mode=amp_mode,
amp_config=cfg_)
# get torch ddp config
torch_ddp_cfg = gpc.config.get('torch_ddp', dict())
# gradient handler
gradient_handler_cfg = gpc.config.get('gradient_handler', None)
if gradient_handler_cfg is None:
# if gradient handler is not specified in the configuration file,
# check in the following order
# 1. if optimizer is ZERO, then use zero grad handler
# 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp
# 3. if using pipeline and dp size larger than 1, use data parallel grad handler
if isinstance(optimizer, ShardedOptimizerV2):
gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
if verbose:
logger.info(
"Training with zero is detected, ZeROGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_ddp() and MOE_CONTEXT.is_initialized:
gradient_handler_cfg = [dict(type='MoeGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected with moe parallel, MoeGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif is_using_sequence():
model = DDP(model,
process_group=gpc.get_group(ParallelMode.SEQUENCE_DP),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Sequence Parallelism',
ranks=[0])
elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE:
model = DDP(model,
process_group=gpc.get_group(ParallelMode.DATA),
device_ids=[torch.cuda.current_device()],
**torch_ddp_cfg)
if verbose:
logger.info('Model is using torch.nn.parallel.DistributedDataParallel for Data Parallelism', ranks=[0])
elif is_using_ddp():
gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
if verbose:
logger.info(
"Data parallel training is detected when using pipeline parallel, "
"DataParallelGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
# add pipeline parallel gradient handler, if pipeline shared module is detected
for param in model.parameters():
if getattr(param, 'pipeline_shared_module_pg', None) is not None:
if gradient_handler_cfg is None:
gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')]
else:
gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler'))
if verbose:
logger.info(
"pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
break
else:
if not isinstance(gradient_handler_cfg, list):
raise ConfigException(
f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}"
)
# turn off sync buffer for NaiveAMPModel if using torch DDP and NaiveAMPModel at the same time
# to avoid duplicated buffer synchronization
if isinstance(model, DDP) and isinstance(model.module, NaiveAMPModel):
model.module.sync_buffer = False
# initialize schedule for engine
if is_using_pp():
tensor_shape = get_tensor_shape()
use_interleaved = hasattr(gpc.config, 'model') and hasattr(gpc.config.model, 'num_chunks')
if gpc.is_initialized(ParallelMode.PARALLEL_1D):
scatter_gather = True
else:
scatter_gather = False
if use_interleaved:
if isinstance(model, nn.Sequential):
model = nn.ModuleList([model])
schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
gpc.config.model.num_chunks,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES,
tensor_shape=tensor_shape,
scatter_gather_tensors=scatter_gather)
else:
schedule = NonPipelineSchedule()
if gradient_handler_cfg is None:
gradient_handlers = None
if verbose and not isinstance(model, DDP):
logger.warning(
"No PyTorch DDP or gradient handler is set up, please make sure you do not need "
"to all-reduce the gradients after a training step.",
ranks=[0])
else:
gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg]
# check if optimizer is OptimizerWrapper
if not isinstance(optimizer, (OptimizerWrapper, ShardedOptimizerV2)):
optimizer = OptimizerWrapper(optim=optimizer)
# gradient accumulation
grad_accum_size = gpc.config.get('gradient_accumulation', None)
if grad_accum_size is not None:
optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
accumulate_size=grad_accum_size,
gradient_handlers=gradient_handlers,
lr_scheduler=lr_scheduler)
engine = Engine(model=model,
optimizer=optimizer,
criterion=criterion,
gradient_handlers=gradient_handlers,
clip_grad_norm=clip_grad_norm,
ophook_list=ophooks,
schedule=schedule)
return engine, train_dataloader, test_dataloader, lr_scheduler
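A minimal end-to-end sketch of the flow defined in this file, assuming it lands as colossalai/legacy/initialize.py and that launch_from_torch and initialize are re-exported from colossalai.legacy (the re-export path is an assumption); the script is expected to be started with torchrun so that RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT are set:

import torch
import torch.nn as nn
from colossalai.legacy import initialize, launch_from_torch

launch_from_torch(config=dict())    # empty config: no zero/fp16/pipeline sections
model = nn.Linear(16, 16)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# only the returned engine is guaranteed to be non-None
engine, train_dataloader, _, _ = initialize(model, optimizer, criterion, train_dataloader=None)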

@ -1,4 +1,3 @@
from ._ops import *
from .layer import *
from .loss import *
from .metric import *

@ -1,9 +1 @@
from .addmm import colo_addmm
from .batch_norm import colo_batch_norm
from .element_wise import *
from .embedding import colo_embedding
from .embedding_bag import colo_embedding_bag
from .layernorm import colo_layernorm
from .linear import colo_linear
from .loss import colo_cross_entropy
from .view import colo_view
from ._utils import *

@ -3,9 +3,10 @@ from typing import List, Optional, Union
import torch
import torch.distributed as dist
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.nn.layer.utils import divide
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
from colossalai.legacy.tensor import ColoTensorSpec, ProcessGroup
from colossalai.tensor import ColoTensor
GeneralTensor = Union[ColoTensor, torch.Tensor]
Number = Union[int, float]

@ -1,90 +0,0 @@
import torch
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, Number, convert_to_colo_tensor, reduce_grad, reduce_input
def colo_addmm_1Drow(input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number,
alpha: Number) -> ColoTensor:
# mat1:S[1] x mat2:S[0] = Output:P
# beta * input + alpha * All-Reduce(Output) = res
mat1 = mat1.redistribute(ShardSpec([-1], [mat2.get_tp_world_size()]), mat2.get_process_group())
# Output:P
partial_output = torch.mm(mat1, mat2)
# Reduce(Output)
output = reduce_input(partial_output, mat2.get_process_group())
# input
assert not input_tensor.has_compute_spec(), 'Invalid input spec for 1Drow addmm op'
output = beta * input_tensor + alpha * output
output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(input_tensor.get_process_group()))
return output
def colo_addmm_1Dcol(input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number,
alpha: Number) -> ColoTensor:
# mat1:B x mat2:S[1] + input:S[1] = Output:S[1]
compute_spec = mat2.compute_spec
mat1 = mat1.redistribute(ReplicaSpec())
mat1 = reduce_grad(mat1, mat1.get_process_group())
output_parallel = torch.addmm(input_tensor, mat1, mat2, beta=beta, alpha=alpha)
output_spec = ColoTensorSpec(input_tensor.get_process_group(), ShardSpec([-1], [mat2.get_tp_world_size()]),
ComputeSpec(ComputePattern.TP1D))
output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
if compute_spec.output_replicate:
return output.to_replicate()
else:
return output
def colo_addmm_1d(mode: str, input_tensor: ColoTensor, mat1: ColoTensor, mat2: ColoTensor, beta: Number,
alpha: Number) -> ColoTensor:
assert mode in ('row', 'col')
funcs = {'row': colo_addmm_1Drow, 'col': colo_addmm_1Dcol}
return funcs[mode](input_tensor, mat1, mat2, beta, alpha)
@colo_op_impl(torch.addmm)
def colo_addmm(input_tensor: GeneralTensor,
mat1: ColoTensor,
mat2: ColoTensor,
beta: Number = 1,
alpha: Number = 1,
**kargs) -> ColoTensor:
"""Handles ``__torch_function__`` dispatch for ``torch.nn.functional.linear``.
This method computes a linear.
"""
# At least one of the tensor should be ColoTensor
assert isinstance(mat2, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, mat2.get_process_group())
mat1 = convert_to_colo_tensor(mat1, mat2.get_process_group())
# Add communication logic before and after linear call.
ret_tensor = None
if not mat2.has_compute_spec(): # No Model Parallel Applied
assert mat2.is_replicate(), 'Invalid mat2 spec for native addmm op'
assert input_tensor.is_replicate(), 'Invalid input spec for native addmm op'
ret_tensor = ColoTensor.from_torch_tensor(tensor=torch.addmm(input_tensor,
mat1,
mat2,
beta=beta,
alpha=alpha,
**kargs),
spec=ColoTensorSpec(mat2.get_process_group()))
elif mat2.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied
if mat2.is_shard_1drow() and input_tensor.is_replicate():
mode = 'row'
elif mat2.is_shard_1dcol() and (input_tensor.is_shard_1dcol() or input_tensor.is_shard_1drow()):
mode = 'col'
else:
raise NotImplementedError
ret_tensor = colo_addmm_1d(mode, input_tensor, mat1, mat2, beta, alpha)
else:
raise NotImplementedError
return ret_tensor

@ -1,33 +0,0 @@
from typing import Optional
import torch.nn.functional as F
from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
@colo_op_impl(F.batch_norm)
def colo_batch_norm(
input: GeneralTensor,
running_mean: Optional[GeneralTensor],
running_var: Optional[GeneralTensor],
weight: Optional[GeneralTensor] = None,
bias: Optional[GeneralTensor] = None,
training: bool = False,
momentum: float = 0.1,
eps: float = 1e-5,
):
assert isinstance(weight, ColoTensor)
running_mean = running_mean.detach()
running_var = running_var.detach()
input = convert_to_colo_tensor(input, weight.get_process_group())
bias = convert_to_colo_tensor(bias, weight.get_process_group())
input = input.redistribute(ReplicaSpec())
bias = bias.redistribute(ReplicaSpec())
output = F.batch_norm(input, running_mean, running_var, weight, bias, training, momentum, eps)
output = ColoTensor.from_torch_tensor(tensor=output, spec=ColoTensorSpec(pg=weight.get_process_group()))
return output

@ -1,250 +0,0 @@
import torch
import torch.nn.functional as F
from torch import Tensor
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
def register_elementwise_op(op):
@colo_op_impl(op)
def elementwise_op(input_tensor: GeneralTensor, *args, **kwargs):
"""
Handles ``__torch_function__`` dispatch for an elementwise op such
as ``torch.nn.functional.gelu`` or ``torch.nn.functional.relu``.
This method computes on either a normal tensor or a sharded tensor.
"""
if 'inplace' in kwargs:
# TODO(jiaruifang) inplace will cause bugs
input_tensor = input_tensor.clone()
return op(input_tensor, *args, **kwargs)
else:
output = op(input_tensor, *args, **kwargs)
# return output
if isinstance(input_tensor, ColoTensor):
if isinstance(output, str):
return output
if not isinstance(output, torch.Tensor):
raise NotImplementedError
return ColoTensor.from_torch_tensor(output,
spec=ColoTensorSpec(input_tensor.get_process_group(),
dist_attr=input_tensor.dist_spec))
# @colo_op_impl(torch.relu_)
# def elementwise_op(input_tensor):
# torch.relu_(input_tensor.data)
# return input_tensor
# @colo_op_impl(Tensor.add_)
# def elementwise_op(input_tensor: ColoTensor, *args, **kwargs):
# input_tensor = input_tensor.data.add_(*args, **kwargs)
# return input_tensor
# Tensor op
register_elementwise_op(Tensor.abs)
register_elementwise_op(Tensor.absolute)
register_elementwise_op(Tensor.acos)
register_elementwise_op(Tensor.arccos)
register_elementwise_op(Tensor.angle)
register_elementwise_op(Tensor.asin)
register_elementwise_op(Tensor.arcsin)
register_elementwise_op(Tensor.atan)
register_elementwise_op(Tensor.arctan)
register_elementwise_op(Tensor.all)
register_elementwise_op(Tensor.any)
register_elementwise_op(Tensor.bernoulli)
register_elementwise_op(Tensor.bfloat16)
register_elementwise_op(Tensor.bitwise_not)
register_elementwise_op(Tensor.bool)
register_elementwise_op(Tensor.byte)
register_elementwise_op(Tensor.ceil)
register_elementwise_op(Tensor.char)
register_elementwise_op(Tensor.clamp)
register_elementwise_op(Tensor.clamp_max)
register_elementwise_op(Tensor.clamp_min)
register_elementwise_op(Tensor.clip)
register_elementwise_op(Tensor.clone)
register_elementwise_op(Tensor.contiguous)
register_elementwise_op(Tensor.copysign)
register_elementwise_op(Tensor.cos)
register_elementwise_op(Tensor.cosh)
register_elementwise_op(Tensor.acosh)
register_elementwise_op(Tensor.arccosh)
register_elementwise_op(Tensor.cpu)
register_elementwise_op(Tensor.cuda)
register_elementwise_op(Tensor.deg2rad)
register_elementwise_op(Tensor.detach)
register_elementwise_op(Tensor.digamma)
register_elementwise_op(Tensor.double)
register_elementwise_op(Tensor.erf)
register_elementwise_op(Tensor.erfc)
register_elementwise_op(Tensor.erfinv)
register_elementwise_op(Tensor.exp)
register_elementwise_op(Tensor.expm1)
register_elementwise_op(Tensor.fix)
register_elementwise_op(Tensor.trunc)
register_elementwise_op(Tensor.float)
register_elementwise_op(Tensor.float_power)
register_elementwise_op(Tensor.floor)
register_elementwise_op(Tensor.frac)
register_elementwise_op(Tensor.half)
register_elementwise_op(Tensor.hardshrink)
register_elementwise_op(Tensor.heaviside)
register_elementwise_op(Tensor.i0)
register_elementwise_op(Tensor.int)
register_elementwise_op(Tensor.isfinite)
register_elementwise_op(Tensor.isinf)
register_elementwise_op(Tensor.isposinf)
register_elementwise_op(Tensor.isneginf)
register_elementwise_op(Tensor.isnan)
register_elementwise_op(Tensor.lgamma)
register_elementwise_op(Tensor.log)
register_elementwise_op(Tensor.log10)
register_elementwise_op(Tensor.log1p)
register_elementwise_op(Tensor.log2)
register_elementwise_op(Tensor.logical_not)
register_elementwise_op(Tensor.logit)
register_elementwise_op(Tensor.long)
register_elementwise_op(Tensor.nan_to_num)
register_elementwise_op(Tensor.neg)
register_elementwise_op(Tensor.negative)
register_elementwise_op(Tensor.positive)
register_elementwise_op(Tensor.pow)
register_elementwise_op(Tensor.rad2deg)
register_elementwise_op(Tensor.reciprocal)
register_elementwise_op(Tensor.round)
register_elementwise_op(Tensor.rsqrt)
register_elementwise_op(Tensor.short)
register_elementwise_op(Tensor.sigmoid)
register_elementwise_op(Tensor.sign)
register_elementwise_op(Tensor.signbit)
register_elementwise_op(Tensor.sgn)
register_elementwise_op(Tensor.sin)
register_elementwise_op(Tensor.sinc)
register_elementwise_op(Tensor.sinh)
register_elementwise_op(Tensor.asinh)
register_elementwise_op(Tensor.arcsinh)
register_elementwise_op(Tensor.sqrt)
register_elementwise_op(Tensor.square)
register_elementwise_op(Tensor.to)
register_elementwise_op(Tensor.tan)
register_elementwise_op(Tensor.tanh)
register_elementwise_op(Tensor.atanh)
register_elementwise_op(Tensor.arctanh)
register_elementwise_op(Tensor.type)
register_elementwise_op(Tensor.type_as)
# torch OP
register_elementwise_op(torch.abs)
register_elementwise_op(torch.absolute)
register_elementwise_op(torch.acos)
register_elementwise_op(torch.arccos)
register_elementwise_op(torch.angle)
register_elementwise_op(torch.asin)
register_elementwise_op(torch.arcsin)
register_elementwise_op(torch.atan)
register_elementwise_op(torch.arctan)
register_elementwise_op(torch.all)
register_elementwise_op(torch.any)
register_elementwise_op(torch.bernoulli)
register_elementwise_op(torch.bitwise_not)
register_elementwise_op(torch.ceil)
register_elementwise_op(torch.clamp)
register_elementwise_op(torch.clamp_max)
register_elementwise_op(torch.clamp_min)
register_elementwise_op(torch.clip)
register_elementwise_op(torch.clone)
register_elementwise_op(torch.copysign)
register_elementwise_op(torch.cos)
register_elementwise_op(torch.cosh)
register_elementwise_op(torch.acosh)
register_elementwise_op(torch.arccosh)
register_elementwise_op(torch.deg2rad)
register_elementwise_op(torch.digamma)
register_elementwise_op(torch.erf)
register_elementwise_op(torch.erfc)
register_elementwise_op(torch.erfinv)
register_elementwise_op(torch.exp)
register_elementwise_op(torch.expm1)
register_elementwise_op(torch.fix)
register_elementwise_op(torch.trunc)
register_elementwise_op(torch.float_power)
register_elementwise_op(torch.floor)
register_elementwise_op(torch.frac)
register_elementwise_op(torch.hardshrink)
register_elementwise_op(torch.heaviside)
register_elementwise_op(torch.i0)
register_elementwise_op(torch.isfinite)
register_elementwise_op(torch.isinf)
register_elementwise_op(torch.isposinf)
register_elementwise_op(torch.isneginf)
register_elementwise_op(torch.isnan)
register_elementwise_op(torch.lgamma)
register_elementwise_op(torch.log)
register_elementwise_op(torch.log10)
register_elementwise_op(torch.log1p)
register_elementwise_op(torch.log2)
register_elementwise_op(torch.logical_not)
register_elementwise_op(torch.logit)
register_elementwise_op(torch.nan_to_num)
register_elementwise_op(torch.neg)
register_elementwise_op(torch.negative)
register_elementwise_op(torch.positive)
register_elementwise_op(torch.pow)
register_elementwise_op(torch.rad2deg)
register_elementwise_op(torch.reciprocal)
register_elementwise_op(torch.round)
register_elementwise_op(torch.rsqrt)
register_elementwise_op(torch.sigmoid)
register_elementwise_op(torch.sign)
register_elementwise_op(torch.signbit)
register_elementwise_op(torch.sgn)
register_elementwise_op(torch.sin)
register_elementwise_op(torch.sinc)
register_elementwise_op(torch.sinh)
register_elementwise_op(torch.asinh)
register_elementwise_op(torch.arcsinh)
register_elementwise_op(torch.sqrt)
register_elementwise_op(torch.square)
register_elementwise_op(torch.tan)
register_elementwise_op(torch.tanh)
register_elementwise_op(torch.atanh)
register_elementwise_op(torch.arctanh)
register_elementwise_op(torch.zeros_like)
# nn.functional OP
register_elementwise_op(F.threshold)
register_elementwise_op(F.relu)
register_elementwise_op(F.hardtanh)
register_elementwise_op(F.hardswish)
register_elementwise_op(F.relu6)
register_elementwise_op(F.elu)
register_elementwise_op(F.selu)
register_elementwise_op(F.celu)
register_elementwise_op(F.leaky_relu)
register_elementwise_op(F.prelu)
register_elementwise_op(F.rrelu)
register_elementwise_op(F.gelu)
register_elementwise_op(F.logsigmoid)
register_elementwise_op(F.hardshrink)
register_elementwise_op(F.tanhshrink)
register_elementwise_op(F.softsign)
register_elementwise_op(F.softplus)
register_elementwise_op(F.softmin)
register_elementwise_op(F.softmax)
register_elementwise_op(F.softshrink)
register_elementwise_op(F.gumbel_softmax)
register_elementwise_op(F.log_softmax)
register_elementwise_op(F.tanh)
register_elementwise_op(F.sigmoid)
register_elementwise_op(F.hardsigmoid)
register_elementwise_op(F.silu)
register_elementwise_op(F.mish)
# TODO(ver217): dropout handles seed
register_elementwise_op(F.dropout)
register_elementwise_op(F.alpha_dropout)
register_elementwise_op(F.feature_alpha_dropout)

@ -1,142 +0,0 @@
from typing import Optional
import torch.nn.functional as F
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input
def colo_embedding_1Dcol(input_tensor: ColoTensor,
weight: ColoTensor,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.0,
scale_grad_by_freq: bool = False,
sparse: bool = False) -> ColoTensor:
# embedding_1Dcol splits the weight (lookup table) into (num_embeddings, embedding_dim/P) shards
# gather the split lookup table
input_tensor = input_tensor.redistribute(ReplicaSpec())
output_parallel = F.embedding(input_tensor,
weight,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
output_spec = ColoTensorSpec(weight.get_process_group(), ShardSpec([-1], [weight.get_tp_world_size()]),
ComputeSpec(ComputePattern.TP1D))
output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
compute_spec = weight.compute_spec
if compute_spec.output_replicate:
return output.to_replicate()
else:
return output
def colo_embedding_1Drow(input_tensor: ColoTensor,
weight: ColoTensor,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.0,
scale_grad_by_freq: bool = False,
sparse: bool = False) -> ColoTensor:
# embedding_1Drow splits the weight (lookup table) into shards of shape [num_embeddings/P, embedding_dim]
# get the index of current segment and mask other segments with 0
# get complete input tensor through all-gather
input_tensor = input_tensor.redistribute(ReplicaSpec())
# tensor_parallel_rank = gpc.get_local_rank(ParallelMode.PARALLEL_1D)
tensor_parallel_rank = weight.get_process_group().tp_local_rank()
num_embeddings_per_partition = weight.size_local(0)
vocab_start_index = tensor_parallel_rank * num_embeddings_per_partition
vocab_end_index = vocab_start_index + num_embeddings_per_partition
# build the mask.
input_mask = (input_tensor < vocab_start_index) | (input_tensor >= vocab_end_index)
# mask the input.
# TODO(jzy) masked_input may be an activation managed by ColoTensor.
masked_input = input_tensor - vocab_start_index
masked_input[input_mask] = 0
partial_output = F.embedding(masked_input,
weight,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
# Mask the output embedding.
partial_output[input_mask, :] = 0.
# Reduce across all the model parallel GPUs.
output = reduce_input(partial_output, weight.get_process_group())
output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(weight.get_process_group(), ReplicaSpec()))
return output
def colo_embedding_1d(mode: str,
input_tensor: ColoTensor,
weight: ColoTensor,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.0,
scale_grad_by_freq: bool = False,
sparse: bool = False) -> ColoTensor:
assert mode in ('row', 'col')
funcs = {'row': colo_embedding_1Drow, 'col': colo_embedding_1Dcol}
return funcs[mode](input_tensor,
weight,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
@colo_op_impl(F.embedding)
def colo_embedding(input_tensor: GeneralTensor,
weight: GeneralTensor,
padding_idx: Optional[int] = None,
max_norm: Optional[float] = None,
norm_type: float = 2.0,
scale_grad_by_freq: bool = False,
sparse: bool = False):
"""Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding``.
This method looks up an embedding table.
"""
assert isinstance(weight, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())
if not weight.has_compute_spec(): # No Model Parallel Applied
assert weight.is_replicate(), 'Invalid weight spec for native embedding op'
return ColoTensor.from_torch_tensor(tensor=F.embedding(input_tensor,
weight,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse),
spec=ColoTensorSpec(weight.get_process_group()))
elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied
if weight.is_shard_1drow():
mode = 'row'
elif weight.is_shard_1dcol():
mode = 'col'
else:
raise NotImplementedError
return colo_embedding_1d(mode,
input_tensor,
weight,
padding_idx=padding_idx,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
sparse=sparse)
else:
raise NotImplementedError
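The removed colo_embedding_1Drow above implements the standard vocab-parallel lookup: mask indices outside the local vocabulary shard, embed with a dummy index, zero the masked rows, then all-reduce the partial results. A plain-PyTorch sketch of that technique (a hypothetical helper, not part of the removed API; the process group is assumed to be initialized elsewhere):

import torch
import torch.distributed as dist
import torch.nn.functional as F

def vocab_parallel_embedding(indices, local_weight, vocab_start, vocab_end, group=None):
    # mask indices that belong to other ranks' vocabulary shards
    mask = (indices < vocab_start) | (indices >= vocab_end)
    local_indices = (indices - vocab_start).masked_fill(mask, 0)
    out = F.embedding(local_indices, local_weight)
    out[mask, :] = 0.0                        # zero rows looked up with the dummy index
    if group is not None:
        dist.all_reduce(out, group=group)     # sum partial embeddings across the TP group
    return out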

@ -1,127 +0,0 @@
from typing import Optional
import torch.nn.functional as F
from torch import Tensor
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
def colo_embedding_bag_1Dcol(input_tensor: ColoTensor,
weight: ColoTensor,
offsets: Optional[Tensor] = None,
max_norm: Optional[float] = None,
norm_type: float = 2,
scale_grad_by_freq: bool = False,
mode: str = "mean",
sparse: bool = False,
per_sample_weights: Optional[Tensor] = None,
include_last_offset: bool = False,
padding_idx: Optional[int] = None) -> ColoTensor:
# embedding_bag_1Dcol splits the weight (lookup table) into (num_embeddings, embedding_dim/P) shards
# gather the split lookup table
pg = weight.get_process_group()
input_tensor = input_tensor.redistribute(ReplicaSpec())
output_parallel = F.embedding_bag(input_tensor,
weight,
offsets=offsets,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
mode=mode,
sparse=sparse,
per_sample_weights=per_sample_weights,
include_last_offset=include_last_offset,
padding_idx=padding_idx)
output_spec = ColoTensorSpec(pg, ShardSpec([-1], [weight.get_tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
output = ColoTensor.from_torch_tensor(output_parallel, spec=output_spec)
if weight.compute_spec.output_replicate:
return output.to_replicate()
else:
return output
def colo_embedding_bag_1d(tp_mode: str,
input_tensor: ColoTensor,
weight: ColoTensor,
offsets: Optional[Tensor] = None,
max_norm: Optional[float] = None,
norm_type: float = 2,
scale_grad_by_freq: bool = False,
mode: str = "mean",
sparse: bool = False,
per_sample_weights: Optional[Tensor] = None,
include_last_offset: bool = False,
padding_idx: Optional[int] = None) -> ColoTensor:
assert tp_mode in ('col',)
funcs = {'col': colo_embedding_bag_1Dcol}
return funcs[tp_mode](input_tensor,
weight,
offsets=offsets,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
mode=mode,
sparse=sparse,
per_sample_weights=per_sample_weights,
include_last_offset=include_last_offset,
padding_idx=padding_idx)
@colo_op_impl(F.embedding_bag)
def colo_embedding_bag(input_tensor: GeneralTensor,
weight: GeneralTensor,
offsets: Optional[Tensor] = None,
max_norm: Optional[float] = None,
norm_type: float = 2,
scale_grad_by_freq: bool = False,
mode: str = "mean",
sparse: bool = False,
per_sample_weights: Optional[Tensor] = None,
include_last_offset: bool = False,
padding_idx: Optional[int] = None):
"""Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding_bag``.
This method looks up an embedding table.
"""
assert isinstance(weight, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())
# Handle different parallel actions.
if not weight.has_compute_spec(): # No Model Parallel Applied
assert weight.is_replicate(), 'Invalid weight spec for native embedding op'
return ColoTensor.from_torch_tensor(tensor=F.embedding_bag(input_tensor,
weight,
offsets=offsets,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
mode=mode,
sparse=sparse,
per_sample_weights=per_sample_weights,
include_last_offset=include_last_offset,
padding_idx=padding_idx),
spec=ColoTensorSpec(weight.get_process_group()))
elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied
if weight.is_shard_1dcol():
tp_mode = 'col'
else:
raise NotImplementedError
return colo_embedding_bag_1d(tp_mode,
input_tensor,
weight,
offsets=offsets,
max_norm=max_norm,
norm_type=norm_type,
scale_grad_by_freq=scale_grad_by_freq,
mode=mode,
sparse=sparse,
per_sample_weights=per_sample_weights,
include_last_offset=include_last_offset,
padding_idx=padding_idx)
else:
raise NotImplementedError

@ -1,28 +0,0 @@
from typing import List, Optional
import torch.nn.functional as F
from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
@colo_op_impl(F.layer_norm)
def colo_layernorm(
input_tensor: GeneralTensor,
normalized_shape: List[int],
weight: Optional[GeneralTensor] = None,
bias: Optional[GeneralTensor] = None,
eps: float = 1e-5,
):
assert isinstance(weight, ColoTensor)
input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())
bias = convert_to_colo_tensor(bias, weight.get_process_group())
input_tensor = input_tensor.redistribute(ReplicaSpec())
output = F.layer_norm(input_tensor, normalized_shape, weight=weight, bias=bias, eps=eps)
output = ColoTensor.from_torch_tensor(tensor=output,
spec=ColoTensorSpec(pg=input_tensor.get_process_group(),
dist_attr=input_tensor.dist_spec))
return output

@ -1,171 +0,0 @@
from copy import deepcopy
from typing import Optional
import torch.nn.functional as F
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor.sharding_spec import ShardingSpec
from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_grad, reduce_input
def colo_linear_1drow(input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor':
# Input:S[1] x Weight:S[0] = Output:P
# All-Reduce(Output) + bias = res
# Input:S[1]
pg = weight.get_process_group()
input_tensor = input_tensor.redistribute(ShardSpec([-1], [weight.get_tp_world_size()]), pg)
# Output:P
partial_output = F.linear(input_tensor, weight)
# Reduce(Output)
output = reduce_input(partial_output, pg)
# Bias
if bias is not None:
assert not bias.has_compute_spec(), 'Invalid bias spec for 1Drow Linear op'
output = output + bias
output = ColoTensor.from_torch_tensor(output, spec=ColoTensorSpec(pg, ReplicaSpec()))
return output
def colo_linear_1dcol(input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor':
# Input:B x Weight:S[1] + Bias:S[1] = Output:S[1]
# All-Gather(Output)
# Input:B
compute_spec = weight.compute_spec
input_tensor = input_tensor.redistribute(ReplicaSpec())
input_parallel = reduce_grad(input_tensor, weight.get_process_group())
output_parallel = F.linear(input_parallel, weight, bias)
output = ColoTensor.from_torch_tensor(output_parallel,
spec=ColoTensorSpec(weight.get_process_group(),
ShardSpec([-1], [weight.get_tp_world_size()]),
ComputeSpec(ComputePattern.TP1D)))
if compute_spec.output_replicate:
return output.to_replicate()
else:
return output
def colo_linear_1d(mode: str, input_tensor: ColoTensor, weight: ColoTensor, bias: Optional[ColoTensor]) -> 'ColoTensor':
assert mode in ('row', 'col')
funcs = {'row': colo_linear_1drow, 'col': colo_linear_1dcol}
return funcs[mode](input_tensor, weight, bias)
# @register_colo_graph(input_pos=[1], param_pos=[2, 3])
def colo_linear_imp(input_tensor: GeneralTensor,
weight: GeneralTensor,
bias: Optional[GeneralTensor] = None) -> 'ColoTensor':
"""Handles ``__torch_function__`` dispatch for ``torch.nn.functional.linear``.
This method computes a linear transformation.
"""
assert isinstance(weight, ColoTensor)
pg = weight.get_process_group()
assert pg
input_tensor = convert_to_colo_tensor(input_tensor, pg)
bias = convert_to_colo_tensor(bias, pg)
# input_tensor, weight, bias = tuple(map(convert_to_colo_tensor, (input_tensor, weight, bias)))
# Add communication logic before and after linear call.
ret_tensor = None
if not weight.has_compute_spec(): # No Model Parallel Applied
assert weight.is_replicate(), 'Invalid weight spec for native Linear op'
assert bias is None or bias.is_replicate(), 'Invalid bias spec for native Linear op'
ret_tensor = ColoTensor.from_torch_tensor(F.linear(input_tensor, weight, bias), spec=ColoTensorSpec(pg))
elif weight.has_compute_pattern(ComputePattern.TP1D): # Single Model Parallel Applied
if weight.is_shard_1dcol() and (bias is None or bias.is_replicate()):
mode = 'row'
elif weight.is_shard_1drow() and (bias is None or bias.is_shard_1drow() or bias.is_shard_1dcol()):
mode = 'col'
else:
raise RuntimeError(f"the weight or bias tensor spec is not valid, weight {weight}, bias {bias}")
ret_tensor = colo_linear_1d(mode, input_tensor, weight, bias)
else:
raise NotImplementedError
return ret_tensor
def _new_colo_linear_imp(input_tensor: GeneralTensor,
weight: GeneralTensor,
bias: Optional[GeneralTensor] = None) -> 'ColoTensor':
"""
A tentative function to compute the distributed linear layer with the latest sharding spec.
This function is subject to future change as the current sharding API is not stable.
"""
# get mesh info
input_sharding_seq = input_tensor.sharding_spec.sharding_sequence
weight_sharding_seq = weight.sharding_spec.sharding_sequence
if bias is not None:
bias_sharding_seq = bias.sharding_spec.sharding_sequence
device_mesh = weight.sharding_spec.device_mesh
pg_axis0 = weight.pg_axis0
pg_axis1 = weight.pg_axis1
# the last dim of input should have the same spec as the first dim of weight
# the weight is transposed, so we look at the second dimension
assert input_sharding_seq[-1] == weight_sharding_seq[1]
if bias is not None:
assert bias_sharding_seq[0] == weight_sharding_seq[0]
# compute the output sharding sequence
# the weight is transposed, so the output features correspond to its first dimension
output_shard_seq = input_sharding_seq[:-1] + weight_sharding_seq[:1]
output_shard_seq = deepcopy(output_shard_seq)
# TODO: add reduce grad logic
# handle column and row parallel linear
# by reusing the implementation above
out = F.linear(input_tensor, weight)
# run all reduce if necessary
last_dim_spec = input_sharding_seq[-1]
if last_dim_spec.is_replica:
pass
elif last_dim_spec.shard_list is not None:
for dim in last_dim_spec.shard_list:
if dim == 0:
reduce_input(out, pg_axis0)
elif dim == 1:
reduce_input(out, pg_axis1)
else:
raise RuntimeError("Found invalid sharding axis {dim}, only 0 or 1 is expected")
# add bias
if bias is not None:
out += bias
# convert shard seq to partition dict
output_partition_dict = {}
for index, dim_spec in enumerate(output_shard_seq):
if not dim_spec.is_replica:
if index not in output_partition_dict:
output_partition_dict[index] = []
output_partition_dict[index].extend(dim_spec.shard_list)
entire_shape = out.shape
output_sharding_spec = ShardingSpec(device_mesh, entire_shape, output_partition_dict)
ret_tensor = ColoTensor.from_torch_tensor(out)
setattr(ret_tensor, 'sharding_spec', output_sharding_spec)
return ret_tensor
def _has_sharding_spec(tensor):
"""
A tentative function to check whether the tensor is using the new sharding spec API. We assume that the sharding spec object is
set as the attribute `sharding_spec` on a tensor.
"""
return hasattr(tensor, 'sharding_spec')
@colo_op_impl(F.linear)
def colo_linear(input: GeneralTensor, weight: GeneralTensor, bias: Optional[GeneralTensor] = None) -> 'ColoTensor':
if _has_sharding_spec(weight):
return _new_colo_linear_imp(input, weight, bias)
else:
return colo_linear_imp(input, weight, bias)

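The loop at the end of `_new_colo_linear_imp` turns the per-dimension output sharding sequence into the partition dict that `ShardingSpec` expects. A standalone sketch of that conversion, using a hypothetical minimal `DimSpec` stand-in rather than the real ColossalAI dim-spec class:

from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class DimSpec:                      # hypothetical stand-in for the real dim-spec object
    is_replica: bool
    shard_list: List[int] = field(default_factory=list)

def shard_seq_to_partition_dict(shard_seq: List[DimSpec]) -> Dict[int, List[int]]:
    """Mirror of the loop above: tensor dim index -> device-mesh axes it is sharded over."""
    partition_dict: Dict[int, List[int]] = {}
    for index, dim_spec in enumerate(shard_seq):
        if not dim_spec.is_replica:
            partition_dict.setdefault(index, []).extend(dim_spec.shard_list)
    return partition_dict

# e.g. an output sharded over mesh axis 0 on dim 0 and replicated on dim 1 -> {0: [0]}
seq = [DimSpec(is_replica=False, shard_list=[0]), DimSpec(is_replica=True)]
assert shard_seq_to_partition_dict(seq) == {0: [0]}
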
@ -1,51 +0,0 @@
from typing import Optional
import torch
import torch.nn.functional as F
from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
@colo_op_impl(F.cross_entropy)
def colo_cross_entropy(input_tensor: GeneralTensor,
target: GeneralTensor,
weight: Optional[GeneralTensor] = None,
size_average: Optional[bool] = None,
ignore_index: int = -100,
reduce: Optional[bool] = None,
reduction: str = "mean",
label_smoothing: float = 0.0):
assert isinstance(weight, ColoTensor) or isinstance(target, ColoTensor) or isinstance(input_tensor, ColoTensor)
pg = input_tensor.get_process_group() if isinstance(input_tensor, ColoTensor) else target.get_process_group()
weight = convert_to_colo_tensor(weight, pg)
target = convert_to_colo_tensor(target, pg)
input_tensor = convert_to_colo_tensor(input_tensor, pg)
if input_tensor.is_replicate(): # Input is gathered
assert target.is_replicate() and (weight is None or weight.is_replicate()), \
"Target tensor and weight tensor both should be complete"
output = F.cross_entropy(input_tensor,
target,
weight=weight,
size_average=size_average,
ignore_index=ignore_index,
reduce=reduce,
reduction=reduction,
label_smoothing=label_smoothing)
return ColoTensor.from_torch_tensor(output, ColoTensorSpec(pg))
elif input_tensor.has_compute_spec(): # Single Model Parallel Applied
if input_tensor.is_shard_1dcol():
assert weight is None, "Current TP cross entropy loss function doesn't support passing weight tensor in"
assert target.is_replicate(), "Target tensor should be complete in TP cross entropy loss function"
output = VocabParallelCrossEntropyLoss1D()(input_tensor,
target,
process_group=input_tensor.process_group.tp_process_group())
return ColoTensor.from_torch_tensor(output, ColoTensorSpec(pg))
else:
raise NotImplementedError
else:
raise NotImplementedError

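`VocabParallelCrossEntropyLoss1D` above computes the loss from logits that are sharded along the vocabulary (class) dimension. A single-process sketch of the underlying math, assuming two vocabulary shards of equal size and simulating the cross-shard reductions with plain sums:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 10)                 # (batch, vocab)
target = torch.randint(0, 10, (4,))
shards = logits.chunk(2, dim=-1)            # each "rank" holds a slice of the vocab dim

# denominator: logsumexp over the full vocab, assembled from per-shard logsumexps
# (in the real kernel this is an all-reduce across the tensor-parallel group)
shard_lse = torch.stack([s.logsumexp(dim=-1) for s in shards], dim=0)
full_lse = shard_lse.logsumexp(dim=0)

# numerator: the target logit, owned by exactly one shard per sample
vocab_per_shard = logits.size(-1) // 2
shard_id = target // vocab_per_shard
local_idx = target % vocab_per_shard
target_logit = torch.stack(
    [shards[s][i, j] for i, (s, j) in enumerate(zip(shard_id.tolist(), local_idx.tolist()))]
)

loss = (full_lse - target_logit).mean()
assert torch.allclose(loss, F.cross_entropy(logits, target), atol=1e-5)
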
@ -1,96 +0,0 @@
import operator
from functools import reduce
from typing import Optional, Union
import torch
from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec
from colossalai.tensor.op_wrapper import colo_op_impl
def _all_int(my_iter):
return all(isinstance(i, int) for i in my_iter)
def _get_valid_shape(shape):
if isinstance(shape, list):
if _all_int(shape):
return tuple(shape)
else:
raise RuntimeError("expects type(int) but finds an other type")
elif isinstance(shape, tuple):
if _all_int(shape):
return shape
else:
return _get_valid_shape(shape[0])
else:
raise RuntimeError("expects an iterable array but finds '{}'".format(type(shape)))
def _shape_infer(org_sp, tgt_sp):
cnt = 0
pos = 0
for idx, dim in enumerate(tgt_sp):
if dim < -1:
raise RuntimeError("invalid shape dimension {}".format(dim))
elif dim == -1:
cnt += 1
pos = idx
if cnt > 1:
raise RuntimeError("only one dimension can be inferred")
org_prod = reduce(operator.mul, org_sp, 1)
tgt_prod = reduce(operator.mul, tgt_sp, 1)
if cnt == 0:
if org_prod != tgt_prod:
raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
else:
return tgt_sp
elif org_prod % tgt_prod != 0:
raise RuntimeError("shape '{}' is invalid for input of size {}".format(tgt_sp, org_prod))
infer_dim = -(org_prod // tgt_prod)
return tgt_sp[:pos] + (infer_dim,) + tgt_sp[pos + 1:]
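# Usage sketch of the two helpers above (assuming they are in scope), showing how
# `colo_view` resolves a `-1` dimension against the *global* shape of the tensor:
assert _get_valid_shape(((3, -1),)) == (3, -1)        # t.view((3, -1)) passes a nested tuple
assert _shape_infer((4, 6), (3, -1)) == (3, 8)        # 24 elements -> the -1 becomes 8
assert _shape_infer((2, 3, 4), (6, 4)) == (6, 4)      # no -1: the sizes must already match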
@colo_op_impl(torch.Tensor.view)
def colo_view(self: ColoTensor, *shape) -> 'ColoTensor':
"""Handles ``__torch_function__`` dispatch for ``torch.Tensor.view``.
Changes the shape of the current tensor.
"""
assert isinstance(self, ColoTensor)
# apply original `view` function for replicated colo tensors
if self.is_replicate():
return self.view(*shape)
cur_sp = self.size()
org_sp = self.size_global()
# parse the passed arguments
tgt_sp = _get_valid_shape(shape)
# get the correct shape from inference
inf_sp = _shape_infer(org_sp, tgt_sp)
if self.is_shard_1drow() and org_sp[0] == inf_sp[0]:
new_shape = (cur_sp[0],) + tgt_sp[1:]
res = self.view(*new_shape)
elif self.is_shard_1dcol() and org_sp[-1] == inf_sp[-1]:
new_shape = tgt_sp[:-1] + (cur_sp[-1],)
res = self.view(*new_shape)
else:
replicated_t = self.redistribute(dist_spec=ReplicaSpec())
return ColoTensor.from_torch_tensor(tensor=replicated_t.view(*shape),
spec=ColoTensorSpec(self.get_process_group()))
return ColoTensor.from_torch_tensor(tensor=res,
spec=ColoTensorSpec(pg=self.get_process_group(), dist_attr=self.dist_spec))
@colo_op_impl(torch.Tensor.size)
def colo_size(self: ColoTensor, dim: Optional[int] = None) -> Union[torch.Size, int]:
size = self.size_global()
if dim is None:
return size
else:
return size[dim]

@ -5,8 +5,8 @@ from contextlib import contextmanager
import torch.nn as nn
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
class ParallelLayer(nn.Module):

@ -1,6 +1,6 @@
import torch.nn as nn
from colossalai.context import ParallelMode, seed
from colossalai.legacy.context import ParallelMode, seed
from ..parallel_1d import *
from ..utils import get_tensor_parallel_mode

@ -1,7 +1,7 @@
import torch
import torch.distributed as dist
from colossalai.core import global_context as gpc
from colossalai.legacy.core import global_context as gpc
try:
import fused_mix_prec_layer_norm_cuda

@ -4,8 +4,8 @@
import torch
import torch.distributed as dist
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from ..utils import divide

@ -10,18 +10,18 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn.parameter import Parameter
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.kernel import LayerNorm
from colossalai.legacy.communication import broadcast
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.context.parallel_context import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
from colossalai.legacy.utils.checkpointing import (
broadcast_state_dict,
gather_tensor_parallel_state_dict,
partition_tensor_parallel_state_dict,
)
from colossalai.nn import init as init
from colossalai.utils.cuda import get_current_device
from ..base_layer import ParallelLayer

@ -5,10 +5,10 @@ import torch.distributed as dist
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.utils import get_current_device
@ -31,9 +31,9 @@ def matmul_2d(
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional):
row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`, optional):
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
Returns:
@ -146,8 +146,8 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -172,8 +172,8 @@ class Matmul_AB_2D(torch.autograd.Function):
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -299,8 +299,8 @@ class Matmul_ABT_2D(torch.autograd.Function):
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
@ -433,8 +433,8 @@ class Matmul_ATB_2D(torch.autograd.Function):
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -620,8 +620,8 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
output_size_per_partition (int): size of output per partition.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
skip_bias_add (bool):
If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
@ -685,8 +685,8 @@ def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, r
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -719,7 +719,7 @@ def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode)
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -767,7 +767,7 @@ def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -795,7 +795,7 @@ def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMo
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found

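The `Matmul_AB_2D` family whose docstrings are updated above implements SUMMA-style matrix products over a 2D process grid. A minimal single-process sketch of the block decomposition it relies on, assuming a 2x2 grid and using plain slicing in place of the broadcast/reduce communication:

import torch

torch.manual_seed(0)
A = torch.randn(4, 6)
B = torch.randn(6, 8)

# partition both operands over a 2x2 grid of blocks
A_blocks = [list(r.chunk(2, dim=1)) for r in A.chunk(2, dim=0)]
B_blocks = [list(r.chunk(2, dim=1)) for r in B.chunk(2, dim=0)]

# C[i][j] = sum_k A[i][k] @ B[k][j]; each (i, j) "rank" only ever holds one block of C
C_blocks = [[sum(A_blocks[i][k] @ B_blocks[k][j] for k in range(2)) for j in range(2)]
            for i in range(2)]
C = torch.cat([torch.cat(row, dim=1) for row in C_blocks], dim=0)

assert torch.allclose(C, A @ B, atol=1e-5)
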
@ -1,6 +1,6 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
def get_summa_dim_from_env() -> int:

@ -8,13 +8,16 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import broadcast
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import LAYERS
from colossalai.legacy.utils.checkpointing import (
gather_tensor_parallel_state_dict,
partition_tensor_parallel_state_dict,
)
from colossalai.nn import init as init
from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
from colossalai.utils.cuda import get_current_device
from ..base_layer import ParallelLayer

@ -5,9 +5,9 @@ import torch.distributed as dist
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.utils import get_current_device
@ -112,8 +112,8 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -139,8 +139,8 @@ class Matmul_AB_2p5D(torch.autograd.Function):
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -264,8 +264,8 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -394,8 +394,8 @@ class Matmul_ATB_2p5D(torch.autograd.Function):
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
@ -606,7 +606,7 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
@ -631,7 +631,7 @@ class _Layernorm2p5D(torch.autograd.Function):
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -682,7 +682,7 @@ def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
row_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -715,7 +715,7 @@ def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: Parallel
Args:
inputs (:class:`torch.tensor`): input tensor.
dim (int): dimension of all-gather.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -730,7 +730,7 @@ class SplitFirst(torch.autograd.Function):
Args:
inputs (:class:`torch.tensor`): input tensor.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
col_parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -798,7 +798,7 @@ def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -826,7 +826,7 @@ def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: Parallel
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
parallel_mode (:class:`colossalai.legacy.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found

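`layernorm_2p5d` (like its 2D counterpart) is documented above in terms of a precomputed mean `E_x` and variance `Var_x`; the distributed kernel's job is to produce those statistics across the row group and then apply the textbook identity. A short single-process check of that identity, assuming the biased variance that layer norm uses and no affine parameters:

import torch
import torch.nn.functional as F

x = torch.randn(2, 8)
eps = 1e-5
E_x = x.mean(dim=-1, keepdim=True)                   # per-sample mean over the hidden dim
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)  # biased variance, as layer norm uses
manual = (x - E_x) / torch.sqrt(Var_x + eps)

assert torch.allclose(manual, F.layer_norm(x, (8,), eps=eps), atol=1e-5)
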
@ -1,6 +1,6 @@
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
def get_tesseract_dim_dep_from_env():

@ -8,17 +8,17 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import broadcast
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
from colossalai.legacy.utils.checkpointing import (
broadcast_state_dict,
gather_tensor_parallel_state_dict,
partition_tensor_parallel_state_dict,
)
from colossalai.nn import init as init
from colossalai.utils.cuda import get_current_device
from ..base_layer import ParallelLayer

@ -7,10 +7,10 @@ import torch
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from colossalai.legacy.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from ._utils import get_parallel_mode_from_env, push_async_grad
@ -73,9 +73,9 @@ def linear_3d(
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -166,9 +166,9 @@ def classifier_3d(
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -260,9 +260,9 @@ def vocab_parallel_classifier_3d(
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -378,8 +378,8 @@ def layernorm_3d(
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float): a value added to the denominator for numerical stability
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_x_weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input x weight parallel mode.
output_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): output parallel mode.
input_x_weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input x weight parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -404,7 +404,7 @@ def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Te
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): Parallel mode.
parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): Parallel mode.
Returns:
:class:`torch.tensor`: The tensor has been split.
@ -434,8 +434,8 @@ def split_batch_3d(input_: Tensor,
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): input parallel mode.
weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
Returns:
:class:`torch.tensor`: The tensor has been split.
@ -471,7 +471,7 @@ def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
Args:
tensor (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -501,7 +501,7 @@ def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode)
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -530,7 +530,7 @@ def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMo
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to scatter.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): Parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -578,8 +578,8 @@ def reduce_by_batch_3d(tensor: Tensor,
r"""All-reduce the input from the model parallel region.
Args:
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
input_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.legacy.context.parallel_mode.ParallelMode`): weight parallel mode.
reduce_mean (bool, optional): If set to ``True``, it will divide the output by
(input parallel size * weight parallel size), defaults to False.

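A single-process sketch of the batch handling described above: `split_batch_3d` chunks the batch dimension once per parallel group, and `reduce_by_batch_3d(..., reduce_mean=True)` divides the all-reduced statistic by the product of the group sizes. The group sizes below are assumed for illustration:

import torch

torch.manual_seed(0)
x = torch.randn(8, 4)                       # global batch of 8
input_ranks, weight_ranks = 2, 2            # assumed sizes of the two 3D sub-groups

# split_batch_3d: the batch dim is chunked once per parallel group
per_input = x.chunk(input_ranks, dim=0)[0]          # this rank's slice in the input group
per_both = per_input.chunk(weight_ranks, dim=0)[0]  # further split in the weight group
assert per_both.shape[0] == x.shape[0] // (input_ranks * weight_ranks)

# reduce_by_batch_3d with reduce_mean=True: summing per-rank means across both groups
# and dividing by (input_ranks * weight_ranks) recovers the global batch mean
partials = list(x.chunk(input_ranks * weight_ranks, dim=0))
local_means = [p.mean(dim=0) for p in partials]      # per-rank statistic
allreduced = sum(local_means)                        # what the all-reduce would produce
assert torch.allclose(allreduced / (input_ranks * weight_ranks), x.mean(dim=0), atol=1e-6)
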
@ -4,9 +4,15 @@ from functools import partial
import torch
from torch import Tensor
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.constants import (
INPUT_GROUP_3D,
INPUT_X_WEIGHT_3D,
OUTPUT_GROUP_3D,
OUTPUT_X_WEIGHT_3D,
WEIGHT_GROUP_3D,
)
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
def get_depth_from_env() -> int:

@ -8,19 +8,25 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication import all_reduce, broadcast
from colossalai.legacy.constants import (
INPUT_GROUP_3D,
INPUT_X_WEIGHT_3D,
OUTPUT_GROUP_3D,
OUTPUT_X_WEIGHT_3D,
WEIGHT_GROUP_3D,
)
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
from colossalai.legacy.utils.checkpointing import (
broadcast_state_dict,
gather_tensor_parallel_state_dict,
partition_tensor_parallel_state_dict,
)
from colossalai.nn import init as init
from colossalai.utils.cuda import get_current_device
from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple

@ -5,9 +5,9 @@ import torch
from torch import distributed as dist
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import ring_forward
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
from colossalai.utils import get_current_device

@ -9,11 +9,11 @@ import torch.nn.functional as F
from torch.nn import Parameter
import colossalai
from colossalai.context import seed
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel import FusedScaleMaskSoftmax
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
from colossalai.legacy.context import seed
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
from colossalai.legacy.registry import LAYERS

@ -8,9 +8,9 @@ import numpy as np
import torch
from torch import Tensor, nn
from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.utils import checkpoint
from colossalai.legacy.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.utils import checkpoint
class CheckpointModule(nn.Module):

@ -7,7 +7,7 @@ from torch import Tensor
from torch import nn as nn
from torch.nn.parameter import Parameter
from colossalai.context import seed
from colossalai.legacy.context import seed
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.cuda import get_current_device
@ -64,7 +64,7 @@ class WrappedDropout(nn.Module):
Args:
p (float, optional): probability of an element to be zeroed, defaults 0.5.
inplace (bool, optional): whether to do dropout in-place, default to be False.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@ -101,7 +101,7 @@ class WrappedDropPath(nn.Module):
Args:
p (float, optional): probability of dropping path, defaults 0.0.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
mode (:class:`colossalai.legacy.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found

@ -3,8 +3,8 @@ from typing import List, Tuple, Union
import torch.distributed as dist
import torch.nn as nn
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
class PipelineSharedModuleWrapper:

@ -2,7 +2,7 @@ from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.global_variables import tensor_parallel_env as env
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
from .loss_1d import VocabParallelCrossEntropyLoss1D

@ -3,8 +3,8 @@ import torch.distributed as dist
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.registry import LOSSES

@ -4,8 +4,8 @@ from torch.cuda.amp import custom_bwd, custom_fwd
from torch.nn.functional import cross_entropy
from torch.nn.modules.loss import _Loss
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.legacy.registry import LOSSES

Some files were not shown because too many files have changed in this diff