From 8accecd55bf1a5aaaeb4b84c06fac0d63850fd5e Mon Sep 17 00:00:00 2001
From: Hongxin Liu
Date: Mon, 4 Sep 2023 11:33:40 +0800
Subject: [PATCH] [legacy] move engine to legacy (#4560)

* [legacy] move engine to legacy

* [example] fix seq parallel example

* [example] fix seq parallel example

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [example] update seq parallel requirements
---
 colossalai/builder/builder.py                 |  2 +-
 colossalai/initialize.py                      |  6 +-
 colossalai/{ => legacy}/engine/__init__.py    |  0
 .../{ => legacy}/engine/_base_engine.py       | 12 ++-
 .../engine/gradient_accumulation/__init__.py  |  4 +-
 .../_gradient_accumulation.py                 |  4 +-
 .../engine/gradient_handler/__init__.py       |  0
 .../_base_gradient_handler.py                 |  0
 .../_data_parallel_gradient_handler.py        |  2 +-
 .../gradient_handler/_moe_gradient_handler.py |  2 +-
 .../_pipeline_parallel_gradient_handler.py    |  0
 .../_sequence_parallel_gradient_handler.py    |  2 +-
 .../_zero_gradient_handler.py                 |  0
 .../engine/gradient_handler/utils.py          |  0
 .../{ => legacy}/engine/schedule/__init__.py  |  0
 .../engine/schedule/_base_schedule.py         |  2 +-
 .../engine/schedule/_non_pipeline_schedule.py |  2 +-
 .../engine/schedule/_pipeline_schedule.py     | 10 +--
 .../engine/schedule/_pipeline_schedule_v2.py  |  2 +-
 colossalai/legacy/trainer/_trainer.py         |  2 +-
 colossalai/utils/profiler/profiler.py         | 18 ++--
 .../profiler/stateful_tensor_mem_extention.py |  8 +-
 .../advanced_tutorials/add_your_parallel.md   |  7 +-
 .../train_gpt_using_hybrid_parallelism.md     |  2 +-
 .../train_vit_using_pipeline_parallelism.md   |  2 +-
 .../train_vit_with_hybrid_parallelism.md      |  2 +-
 docs/source/en/features/gradient_handler.md   |  3 +-
 .../advanced_tutorials/add_your_parallel.md   |  7 +-
 .../train_gpt_using_hybrid_parallelism.md     |  2 +-
 .../train_vit_using_pipeline_parallelism.md   |  2 +-
 .../train_vit_with_hybrid_parallelism.md      |  2 +-
 .../zh-Hans/features/gradient_handler.md      |  3 +-
 .../data/datasets/indexed_dataset.py          | 77 +++++++------------
 .../sequence_parallel/requirements.txt        |  1 +
 examples/tutorial/sequence_parallel/train.py  |  2 +-
 .../test_plugin/test_gemini_plugin.py         |  2 +-
 tests/test_moe/test_grad_handler.py           |  2 +-
 tests/test_moe/test_moe_zero_model.py         |  2 +-
 tests/test_moe/test_moe_zero_optim.py         |  2 +-
 39 files changed, 93 insertions(+), 105 deletions(-)
 rename colossalai/{ => legacy}/engine/__init__.py (100%)
 rename colossalai/{ => legacy}/engine/_base_engine.py (97%)
 rename colossalai/{ => legacy}/engine/gradient_accumulation/__init__.py (94%)
 rename colossalai/{ => legacy}/engine/gradient_accumulation/_gradient_accumulation.py (98%)
 rename colossalai/{ => legacy}/engine/gradient_handler/__init__.py (100%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_base_gradient_handler.py (100%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_data_parallel_gradient_handler.py (94%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_moe_gradient_handler.py (97%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_pipeline_parallel_gradient_handler.py (100%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_sequence_parallel_gradient_handler.py (94%)
 rename colossalai/{ => legacy}/engine/gradient_handler/_zero_gradient_handler.py (100%)
 rename colossalai/{ => legacy}/engine/gradient_handler/utils.py (100%)
 rename colossalai/{ => legacy}/engine/schedule/__init__.py (100%)
 rename colossalai/{ => legacy}/engine/schedule/_base_schedule.py (98%)
 rename colossalai/{ => legacy}/engine/schedule/_non_pipeline_schedule.py (97%)
 rename colossalai/{ => legacy}/engine/schedule/_pipeline_schedule.py (98%)
 rename colossalai/{ => legacy}/engine/schedule/_pipeline_schedule_v2.py (98%)

diff --git a/colossalai/builder/builder.py b/colossalai/builder/builder.py
index 4a9076013..a14509392 100644
--- a/colossalai/builder/builder.py
+++ b/colossalai/builder/builder.py
@@ -71,7 +71,7 @@ def build_gradient_handler(config, model, optimizer):
         optimizer (:class:`torch.optim.Optimizer`): An optimizer object
             containing parameters for the gradient handler
     Returns:
-        An object of :class:`colossalai.engine.BaseGradientHandler`
+        An object of :class:`colossalai.legacy.engine.BaseGradientHandler`
     """
     config_ = config.copy()
     config_['model'] = model
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index dc0df0517..32354dde8 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -21,9 +21,9 @@ from colossalai.builder.builder import build_gradient_handler
 from colossalai.context import Config, ConfigException, ParallelMode
 from colossalai.context.moe_context import MOE_CONTEXT
 from colossalai.core import global_context as gpc
-from colossalai.engine import Engine
-from colossalai.engine.gradient_accumulation import accumulate_gradient
-from colossalai.engine.schedule import (
+from colossalai.legacy.engine import Engine
+from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
+from colossalai.legacy.engine.schedule import (
     InterleavedPipelineSchedule,
     NonPipelineSchedule,
     PipelineSchedule,
diff --git a/colossalai/engine/__init__.py b/colossalai/legacy/engine/__init__.py
similarity index 100%
rename from colossalai/engine/__init__.py
rename to colossalai/legacy/engine/__init__.py
diff --git a/colossalai/engine/_base_engine.py b/colossalai/legacy/engine/_base_engine.py
similarity index 97%
rename from colossalai/engine/_base_engine.py
rename to colossalai/legacy/engine/_base_engine.py
index db27ad0e8..9af4469f4 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/legacy/engine/_base_engine.py
@@ -8,11 +8,17 @@ from torch import Tensor
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 
-from colossalai.engine.gradient_handler import BaseGradientHandler
-from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
+from colossalai.legacy.engine.schedule import (
+    BaseSchedule,
+    InterleavedPipelineSchedule,
+    NonPipelineSchedule,
+    PipelineSchedule,
+)
 from colossalai.logging import get_dist_logger
-from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
 from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
+
 
 class Engine:
     """Basic engine class for training and evaluation. It runs a specific process method
diff --git a/colossalai/engine/gradient_accumulation/__init__.py b/colossalai/legacy/engine/gradient_accumulation/__init__.py
similarity index 94%
rename from colossalai/engine/gradient_accumulation/__init__.py
rename to colossalai/legacy/engine/gradient_accumulation/__init__.py
index 4cb6f4ad7..670c26d06 100644
--- a/colossalai/engine/gradient_accumulation/__init__.py
+++ b/colossalai/legacy/engine/gradient_accumulation/__init__.py
@@ -4,7 +4,7 @@ import torch.nn as nn
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 
 from ._gradient_accumulation import (
     GradAccumDataloader,
@@ -33,7 +33,7 @@ def accumulate_gradient(model: nn.Module,
         dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
             your dataloader object, would be called like iter(dataloader)
         accumulate_size (int): the number of steps to accumulate gradients
-        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+        gradient_handlers (List[:class:`colossalai.legacy.engine.BaseGradientHandler`]):
             list of gradient handler objects. Default is None.
         lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
             your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
diff --git a/colossalai/engine/gradient_accumulation/_gradient_accumulation.py b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py
similarity index 98%
rename from colossalai/engine/gradient_accumulation/_gradient_accumulation.py
rename to colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py
index cf66be1cd..c466f7e2d 100644
--- a/colossalai/engine/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/legacy/engine/gradient_accumulation/_gradient_accumulation.py
@@ -10,7 +10,7 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
 
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 from colossalai.nn.optimizer import ColossalaiOptimizer
 from colossalai.utils import conditional_context
 
@@ -262,7 +262,7 @@ class GradAccumGradientHandler:
     before accumulation size is reached.
 
     Args:
-        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+        grad_handler (:class:`colossalai.legacy.engine.BaseGradientHandler`):
            Your ``gradient_handler`` object for gradient accumulation, would be called
            when achieving `accumulate_size`.
        accumulate_size (int): The number of steps to accumulate gradients.
diff --git a/colossalai/engine/gradient_handler/__init__.py b/colossalai/legacy/engine/gradient_handler/__init__.py
similarity index 100%
rename from colossalai/engine/gradient_handler/__init__.py
rename to colossalai/legacy/engine/gradient_handler/__init__.py
diff --git a/colossalai/engine/gradient_handler/_base_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_base_gradient_handler.py
similarity index 100%
rename from colossalai/engine/gradient_handler/_base_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_base_gradient_handler.py
diff --git a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py
similarity index 94%
rename from colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py
index 5cc7169c5..d0196e3c4 100644
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/legacy/engine/gradient_handler/_data_parallel_gradient_handler.py
@@ -1,7 +1,7 @@
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
 
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
diff --git a/colossalai/engine/gradient_handler/_moe_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py
similarity index 97%
rename from colossalai/engine/gradient_handler/_moe_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py
index b499345d4..f2db95752 100644
--- a/colossalai/engine/gradient_handler/_moe_gradient_handler.py
+++ b/colossalai/legacy/engine/gradient_handler/_moe_gradient_handler.py
@@ -1,9 +1,9 @@
 from colossalai.context.moe_context import MOE_CONTEXT
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
 from colossalai.utils.moe import get_moe_epsize_param_dict
 
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
diff --git a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
similarity index 100%
rename from colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
diff --git a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py
similarity index 94%
rename from colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py
index ea4f0fbb1..f13568094 100644
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/legacy/engine/gradient_handler/_sequence_parallel_gradient_handler.py
@@ -1,7 +1,7 @@
+from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
 
-from ...context.parallel_mode import ParallelMode
 from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
diff --git a/colossalai/engine/gradient_handler/_zero_gradient_handler.py b/colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py
similarity index 100%
rename from colossalai/engine/gradient_handler/_zero_gradient_handler.py
rename to colossalai/legacy/engine/gradient_handler/_zero_gradient_handler.py
diff --git a/colossalai/engine/gradient_handler/utils.py b/colossalai/legacy/engine/gradient_handler/utils.py
similarity index 100%
rename from colossalai/engine/gradient_handler/utils.py
rename to colossalai/legacy/engine/gradient_handler/utils.py
diff --git a/colossalai/engine/schedule/__init__.py b/colossalai/legacy/engine/schedule/__init__.py
similarity index 100%
rename from colossalai/engine/schedule/__init__.py
rename to colossalai/legacy/engine/schedule/__init__.py
diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/legacy/engine/schedule/_base_schedule.py
similarity index 98%
rename from colossalai/engine/schedule/_base_schedule.py
rename to colossalai/legacy/engine/schedule/_base_schedule.py
index a2d500411..7505a3eb2 100644
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/legacy/engine/schedule/_base_schedule.py
@@ -95,7 +95,7 @@ class BaseSchedule(ABC):
         """The process function over a batch of dataset for training or evaluation.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
             forward_only (bool): If True, the process won't include backward.
             return_loss (bool, optional): If False, the loss won't be returned.
diff --git a/colossalai/engine/schedule/_non_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_non_pipeline_schedule.py
similarity index 97%
rename from colossalai/engine/schedule/_non_pipeline_schedule.py
rename to colossalai/legacy/engine/schedule/_non_pipeline_schedule.py
index b9239d928..b67893c1a 100644
--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/legacy/engine/schedule/_non_pipeline_schedule.py
@@ -54,7 +54,7 @@ class NonPipelineSchedule(BaseSchedule):
         The returned labels and loss will None if :attr:`return_loss` is False.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
             forward_only (bool, optional):
                 If True, the model is run for the forward pass, else back propagation will be executed.
diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
similarity index 98%
rename from colossalai/engine/schedule/_pipeline_schedule.py
rename to colossalai/legacy/engine/schedule/_pipeline_schedule.py
index 9fc301a26..88b54ce6a 100644
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule.py
@@ -236,7 +236,7 @@ class PipelineSchedule(BaseSchedule):
         Returns output tensor. This is a helper function and can be ignored by users.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
             return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
             return_output_label (bool, optional): Whether returns output labels.
@@ -274,7 +274,7 @@ class PipelineSchedule(BaseSchedule):
         This is a helper function and can be ignored by users.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): input tensor for this pipeline stage.
             output_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): output tensor for this pipeline stage.
             output_obj_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): gradient of output tensor for this pipeline stage.
@@ -314,7 +314,7 @@ class PipelineSchedule(BaseSchedule):
         Returns a tuple with losses if the last stage, an empty tuple otherwise.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
             forward_only (bool, optional):
                 Whether run forward step only. Default is false. If true, no backward will be run.
@@ -518,7 +518,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
         Returns output tensor. This is a helper function and can be ignored by users.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             model_chunk_id (int): The id of model chunks.
             input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
             return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
@@ -555,7 +555,7 @@ class InterleavedPipelineSchedule(PipelineSchedule):
         communication between pipeline stages as needed.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
             forward_only (bool, optional):
                 Whether run forward step only. Default is false. If true, no backward will be run.
diff --git a/colossalai/engine/schedule/_pipeline_schedule_v2.py b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
similarity index 98%
rename from colossalai/engine/schedule/_pipeline_schedule_v2.py
rename to colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
index 89e45c7aa..9e7372b67 100644
--- a/colossalai/engine/schedule/_pipeline_schedule_v2.py
+++ b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py
@@ -69,7 +69,7 @@ class PipelineScheduleV2(PipelineSchedule):
         Returns a tuple with losses if the last stage, an empty tuple otherwise.
 
         Args:
-            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+            engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
             data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
             forward_only (bool, optional):
                 Whether run forward step only. Default is false. If true, no backward will be run.
diff --git a/colossalai/legacy/trainer/_trainer.py b/colossalai/legacy/trainer/_trainer.py
index fb66acec5..1847e5622 100644
--- a/colossalai/legacy/trainer/_trainer.py
+++ b/colossalai/legacy/trainer/_trainer.py
@@ -4,7 +4,7 @@ import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-from colossalai.engine import Engine
+from colossalai.legacy.engine import Engine
 from colossalai.legacy.trainer.hooks import BaseHook
 from colossalai.logging import DistributedLogger
 from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
diff --git a/colossalai/utils/profiler/profiler.py b/colossalai/utils/profiler/profiler.py
index 8f43a0b96..3026d723d 100644
--- a/colossalai/utils/profiler/profiler.py
+++ b/colossalai/utils/profiler/profiler.py
@@ -1,17 +1,17 @@
-import os
-from typing import List
-from colossalai.engine import Engine
-from torch.profiler import profile as torch_profile
-from torch.profiler.profiler import ProfilerAction
-from typing import Any, Callable, Iterable, Optional
-from torch.autograd import ProfilerActivity
+import gzip
 import json
 import os
 import tempfile
-import gzip
+from typing import Any, Callable, Iterable, List, Optional
+
+from torch.autograd import ProfilerActivity
+from torch.profiler import profile as torch_profile
+from torch.profiler.profiler import ProfilerAction
+
+from colossalai.legacy.engine import Engine
+from colossalai.logging import get_dist_logger
 from colossalai.utils.profiler.extention import ProfilerExtension
 from colossalai.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention
-from colossalai.logging import get_dist_logger
 
 
 class profile(torch_profile):
diff --git a/colossalai/utils/profiler/stateful_tensor_mem_extention.py b/colossalai/utils/profiler/stateful_tensor_mem_extention.py
index 127055c8c..412bd7277 100644
--- a/colossalai/utils/profiler/stateful_tensor_mem_extention.py
+++ b/colossalai/utils/profiler/stateful_tensor_mem_extention.py
@@ -1,12 +1,14 @@
 import os
 import threading
 import time
-import torch
 from enum import Enum
 from typing import List
-from colossalai.gemini.stateful_tensor import StatefulTensor
+
+import torch
+
 from colossalai.gemini.ophooks import BaseOpHook
-from colossalai.engine import Engine
+from colossalai.gemini.stateful_tensor import StatefulTensor
+from colossalai.legacy.engine import Engine
 from colossalai.utils.profiler.extention import ProfilerExtension
 
 
diff --git a/docs/source/en/advanced_tutorials/add_your_parallel.md b/docs/source/en/advanced_tutorials/add_your_parallel.md
index 1caf58c87..cda49af47 100644
--- a/docs/source/en/advanced_tutorials/add_your_parallel.md
+++ b/docs/source/en/advanced_tutorials/add_your_parallel.md
@@ -92,14 +92,14 @@ follow the steps below to create a new distributed initialization.
 Gradient handlers are objects which execute the all-reduce operations on parameters' gradients.
 As different all-reduce strategies may be executed for different kinds of parallelism, users can
-inherit `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
+inherit `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
 uses the normal data parallel gradient handler which all-reduces the gradients across data parallel ranks. The data
 parallel gradient handler is added to the engine automatically if data parallel is detected.
 
 You can add your own gradient handler like below:
 
 ```python
 from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 
 @GRADIENT_HANDLER.register_module
 class YourGradientHandler(BaseGradientHandler):
@@ -121,4 +121,5 @@ gradient_handlers = [
 
 Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and
 non-pipeline schedules. If you want to modify how the forward and backward passes are executed, you can
-inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+inherit `colossalai.legacy.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+
diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
index 24aa2610f..98c16e922 100644
--- a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -39,7 +39,7 @@ from colossalai.amp import AMP_TYPE
 from colossalai.builder.pipeline import partition_uniform
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
diff --git a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
index 3475d8f07..370931d87 100644
--- a/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_vit_using_pipeline_parallelism.md
@@ -35,7 +35,7 @@ import colossalai.nn as col_nn
 import torch
 import torch.nn as nn
 from colossalai.builder import build_pipeline_model
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.legacy.trainer import Trainer, hooks
diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index 5b0b694b3..fc1101c5a 100644
--- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -415,7 +415,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
 #### Import modules
 
 ```python
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.utils import MultiTimer
 import os
diff --git a/docs/source/en/features/gradient_handler.md b/docs/source/en/features/gradient_handler.md
index 757016fcb..14ced32b8 100644
--- a/docs/source/en/features/gradient_handler.md
+++ b/docs/source/en/features/gradient_handler.md
@@ -29,7 +29,7 @@ To implement a customized gradient handler, you need to follow these steps.
 
 ```python
 from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
 
 
 @GRADIENT_HANDLER.register_module
@@ -61,3 +61,4 @@ to demonstrate the use of gradient handler. In this example, we used `DataParall
 ```shell
 python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
 ```
+
diff --git a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
index 059eb014a..abfe058c6 100644
--- a/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
+++ b/docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
@@ -81,14 +81,14 @@ Colossal-AI 为用户提供了一个全局 context,使他们能够轻松地管
 ## 梯度 Handler
 
 梯度 handler 是对参数的梯度执行 all-reduce 操作的对象。由于不同的 all-reduce 策略或许在不同的并行中被执行,用户可以继承
-`colossalai.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
+`colossalai.legacy.engine.gradient_handler.BaseGradientHandler` 来实现其策略。目前,Colossal-AI 使用普通的数据并行梯度 handler 在数据并行的 rank 间 all-reduce 梯度。
 如果数据并行被检测到,梯度 handler 会被自动添加进 engine。
 
 你可以添加你自己的梯度 handler,如下所示:
 
 ```python
 from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine import BaseGradientHandler
+from colossalai.legacy.engine import BaseGradientHandler
 
 @GRADIENT_HANDLER.register_module
 class YourGradientHandler(BaseGradientHandler):
@@ -109,4 +109,5 @@ gradient_handlers = [
 ## Schedule
 
 Schedule 包含了如何执行前向和后向计算。目前, Colossal-AI 提供了流水和非流水的 schedule。
-如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
+如果你想修改前向和后向计算的执行方式,你可以继承 `colossalai.legacy.engine.schedule.BaseSchedule` 并实现 `forward_back_step` 函数。
+
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
index a199d31e7..84b48165b 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md
@@ -39,7 +39,7 @@ from colossalai.amp import AMP_TYPE
 from colossalai.builder.pipeline import partition_uniform
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
index d3a98c89b..1ac01c207 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_using_pipeline_parallelism.md
@@ -33,7 +33,7 @@ import colossalai.nn as col_nn
 import torch
 import torch.nn as nn
 from colossalai.builder import build_pipeline_model
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.legacy.trainer import Trainer, hooks
diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
index ddc2502f0..650bab105 100644
--- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
+++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md
@@ -380,7 +380,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw
 #### 导入模块
 
 ```python
-from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
                                         PipelineSchedule)
 from colossalai.utils import MultiTimer
 import os
diff --git a/docs/source/zh-Hans/features/gradient_handler.md b/docs/source/zh-Hans/features/gradient_handler.md
index 701c60fed..b08dd6806 100644
--- a/docs/source/zh-Hans/features/gradient_handler.md
+++ b/docs/source/zh-Hans/features/gradient_handler.md
@@ -26,7 +26,7 @@
 
 ```python
 from colossalai.registry import GRADIENT_HANDLER
-from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
 
 
 @GRADIENT_HANDLER.register_module
@@ -57,3 +57,4 @@ gradient_handler = [dict(type='MyGradientHandler')]
 ```shell
 python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
 ```
+
diff --git a/examples/tutorial/sequence_parallel/data/datasets/indexed_dataset.py b/examples/tutorial/sequence_parallel/data/datasets/indexed_dataset.py
index b4febcd82..9a25dc453 100644
--- a/examples/tutorial/sequence_parallel/data/datasets/indexed_dataset.py
+++ b/examples/tutorial/sequence_parallel/data/datasets/indexed_dataset.py
@@ -3,17 +3,16 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-
 # copied from fairseq/fairseq/data/indexed_dataset.py
 # Removed IndexedRawTextDataset since it relied on Fairseq dictionary
 # other slight modifications to remove fairseq dependencies
 # Added document index to index file and made it accessible.
 # An empty sentence no longer separates documents.
 
-from functools import lru_cache
 import os
 import shutil
 import struct
+from functools import lru_cache
 from itertools import accumulate
 
 import numpy as np
@@ -88,16 +87,7 @@ def write_longs(f, a):
     f.write(np.array(a, dtype=np.int64))
 
 
-dtypes = {
-    1: np.uint8,
-    2: np.int8,
-    3: np.int16,
-    4: np.int32,
-    5: np.int64,
-    6: np.float,
-    7: np.double,
-    8: np.uint16
-}
+dtypes = {1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: float, 7: np.double, 8: np.uint16}
 
 
 def code(dtype):
@@ -136,10 +126,8 @@ class IndexedDataset(torch.utils.data.Dataset):
     def read_index(self, path):
        with open(index_file_path(path), 'rb') as f:
            magic = f.read(8)
-            assert magic == self._HDR_MAGIC, (
-                'Index file doesn\'t match expected format. '
-                'Make sure that --dataset-impl is configured properly.'
-            )
+            assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. '
+                                              'Make sure that --dataset-impl is configured properly.')
            version = f.read(8)
            assert struct.unpack('