diff --git a/README.md b/README.md index bbf0f7638..1d8b81e98 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai pip install colossalai ``` -### Install From Source +### Install From Source (Recommended) + +> We **recommend** installing from source, as Colossal-AI is updated frequently in these early versions. The documentation is kept in line with the main branch of the repository. Feel free to raise an issue if you encounter any problems. :) ```shell -git clone git@github.com:hpcaitech/ColossalAI.git +git clone https://github.com/hpcaitech/ColossalAI.git cd ColossalAI # install dependency pip install -r requirements/requirements.txt @@ -64,8 +66,8 @@ model = ... # sampler by default train_dataset = ... train_dataloader = get_dataloader(dataset=dataset, - shuffle=True, - ) + shuffle=True, + ) # build your diff --git a/colossalai/amp/__init__.py b/colossalai/amp/__init__.py index 268eced66..5a30e67fb 100644 --- a/colossalai/amp/__init__.py +++ b/colossalai/amp/__init__.py @@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module, criterion: _Loss, mode: AMP_TYPE, amp_config: Config = None): + """A helper function to wrap training components with the AMP modules of the given mode + + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param criterion: your loss function object + :type criterion: :class:`torch.nn.modules.loss._Loss` + :param mode: amp mode + :type mode: :class:`colossalai.amp.AMP_TYPE` + :param amp_config: configuration for different amp modes + :type amp_config: :class:`colossalai.context.Config` or dict + + :return: (model, optimizer, criterion) + :rtype: Tuple + """ assert isinstance(mode, AMP_TYPE), \ f'expected the argument mode be AMP_TYPE, but got {type(mode)}' diff --git a/colossalai/amp/apex_amp/__init__.py b/colossalai/amp/apex_amp/__init__.py index 2d0ff9771..23cffae0f 100644 --- a/colossalai/amp/apex_amp/__init__.py +++ b/colossalai/amp/apex_amp/__init__.py @@ -7,6 +7,18 @@ import apex.amp as apex_amp def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config): + """A helper function to wrap training components with Apex AMP modules + + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param amp_config: configuration for NVIDIA Apex + :type amp_config: :class:`colossalai.context.Config` or dict + + :return: (model, optimizer) + :rtype: Tuple + """ model, optimizer = apex_amp.initialize(model, optimizer, **amp_config) optimizer = ApexAMPOptimizer(optimizer) return model, optimizer diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/amp/apex_amp/apex_amp.py index d44478364..3c106153d 100644 --- a/colossalai/amp/apex_amp/apex_amp.py +++ b/colossalai/amp/apex_amp/apex_amp.py @@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32 class ApexAMPOptimizer(ColossalaiOptimizer): + ''' A wrapper class for an APEX optimizer that implements apex-specific backward and clip_grad_norm + methods + ''' def backward(self, loss: Tensor): + """ + :param loss: loss computed by a loss function + :type loss: torch.Tensor + """ with apex_amp.scale_loss(loss, self.optim) as scaled_loss: scaled_loss.backward() def clip_grad_norm(self, model: nn.Module, max_norm: float): + """ + :param model: your model object + :type model: torch.nn.Module + :param 
max_norm: the max norm value for gradient clipping + :type max_norm: float + """ if max_norm > 0: clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm) diff --git a/colossalai/amp/naive_amp/__init__.py b/colossalai/amp/naive_amp/__init__.py index e3a49c7e8..08ae7b62a 100644 --- a/colossalai/amp/naive_amp/__init__.py +++ b/colossalai/amp/naive_amp/__init__.py @@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config): + """A helper function to wrap training components with naive AMP modules + + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param amp_config: configuration for naive mode AMP + :type amp_config: :class:`colossalai.context.Config` or dict + + :return: (model, optimizer) + :rtype: Tuple + """ if is_no_pp_or_last_stage(): model = NaiveAMPModel(model, output_to_fp32=True) else: diff --git a/colossalai/amp/naive_amp/_fp16_optimizer.py b/colossalai/amp/naive_amp/_fp16_optimizer.py index d917a97bc..9ac8543c1 100644 --- a/colossalai/amp/naive_amp/_fp16_optimizer.py +++ b/colossalai/amp/naive_amp/_fp16_optimizer.py @@ -146,26 +146,22 @@ class DynamicGradScaler: class FP16Optimizer(Optimizer): """Float16 optimizer for fp16 and bf16 data types. - Arguments: - optimizer: base optimizer such as Adam or SGD - clip_grad: clip gradeints with this global L2 norm. Note - that clipping is ignored if clip_grad == 0 - log_num_zeros_in_grad: return number of zeros in the gradients. - params_have_main_grad: flag indicating if parameters have - a `main_grad` field. If this is set, we are assuming - that the model parameters are store in the `main_grad` - field instead of the typical `grad` field. This happens - for the DDP cases where there is a contihuous buffer - holding the gradients. For example for bfloat16, we want - to do gradient accumulation and all-reduces in float32 - and as a result we store those gradients in the main_grad. - Note that main grad is not necessarily in float32. - bf16: if true, the model is running in bfloat16. - grad_scaler: used for scaling gradients. Note that this can be - None. This case happens when `bf16 = True` and we don't - use any loss scale. Note that for `bf16 = True`, we can have - a constnat gradient scaler. Also for `bf16 = False`, we - always require a grad scaler. + :param optimizer: base optimizer such as Adam or SGD + :type optimizer: torch.optim.Optimizer + :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0 + :type clip_grad: float + :param log_num_zeros_in_grad: return number of zeros in the gradients. 
+ :type log_num_zeros_in_grad: bool + :param initial_scale: initial scale of gradient scaler + :type initial_scale: int + :param growth_factor: the growth rate of loss scale + :type growth_factor: int + :param backoff_factor: the decrease rate of loss scale + :type backoff_factor: float + :param hysteresis: delay shift in dynamic loss scaling + :type hysteresis: int + :param max_scale: maximum loss scale allowed + :type max_scale: int """ def __init__(self, diff --git a/colossalai/amp/naive_amp/naive_amp.py b/colossalai/amp/naive_amp/naive_amp.py index dd0b88b44..8fc1b109b 100644 --- a/colossalai/amp/naive_amp/naive_amp.py +++ b/colossalai/amp/naive_amp/naive_amp.py @@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer class NaiveAMPOptimizer(ColossalaiOptimizer): + """A wrapper class for an optimizer to cast all parameters to fp16 + + :param optim: a normal optimizer like Adam or SGD + :type optim: torch.optim.Optimizer + """ def __init__(self, optim: Optimizer, *args, **kwargs): optim = FP16Optimizer(optimizer=optim, *args, **kwargs) super().__init__(optim) def backward(self, loss: Tensor): + """backward with gradient scaler + :param loss: loss computed by a loss function + :type loss: torch.Tensor + """ loss = self.optim.scale_loss(loss) loss.backward() @@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer): class NaiveAMPModel(nn.Module): + """A wrapper class for a model to cast the model into fp16 and + automatically cast the input and output + """ def __init__(self, model: nn.Module, diff --git a/colossalai/amp/torch_amp/__init__.py b/colossalai/amp/torch_amp/__init__.py index b3c5b0c5b..af8d34904 100644 --- a/colossalai/amp/torch_amp/__init__.py +++ b/colossalai/amp/torch_amp/__init__.py @@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module, optimizer: Optimizer, criterion: _Loss, amp_config: Config): + """A helper function to wrap training components with Torch AMP modules + + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param criterion: your loss function object + :type criterion: :class:`torch.nn.modules.loss._Loss` + :param amp_config: configuration for different amp modes + :type amp_config: :class:`colossalai.context.Config` or dict + + :return: (model, optimizer, criterion) + :rtype: Tuple + """ model = TorchAMPModel(model) optimizer = TorchAMPOptimizer(optimizer, **amp_config) criterion = TorchAMPLoss(criterion) diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/amp/torch_amp/torch_amp.py index 396360184..b323a25c5 100644 --- a/colossalai/amp/torch_amp/torch_amp.py +++ b/colossalai/amp/torch_amp/torch_amp.py @@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32 class TorchAMPOptimizer(ColossalaiOptimizer): + """A wrapper class which integrates PyTorch AMP with an optimizer + + :param optim: a normal optimizer like Adam or SGD + :type optim: torch.optim.Optimizer + """ def __init__(self, optim: Optimizer, *args, **kwargs): super().__init__(optim) self.scaler = GradScaler(*args, **kwargs) def backward(self, loss: Tensor): + """backward with torch amp gradient scaler + :param loss: loss computed by a loss function + :type loss: torch.Tensor + """ self.scaler.scale(loss).backward() def step(self): + """update the parameters of the model + """ self.scaler.step(self.optim) self.scaler.update() def clip_grad_norm(self, model: nn.Module, max_norm: float): + """apply gradient clipping to the model parameters + :param 
model: your model object + :type model: torch.nn.Module + :param max_norm: max norm value for gradient clipping + :type max_norm: float + """ if max_norm > 0.0: self.scaler.unscale_(self.optim) clip_grad_norm_fp32(model.parameters(), max_norm) class TorchAMPModel(nn.Module): + """A wrapper class for a model object which executes forward with values automatically + cast to fp16 + """ def __init__(self, model: nn.Module) -> None: super().__init__() @@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module): class TorchAMPLoss(nn.Module): - + """A wrapper class for a criterion object which computes the loss in mixed-precision context + :param loss: a loss function object + :type loss: torch.nn.modules.loss._Loss + """ def __init__(self, loss: _Loss): super().__init__() self.loss = loss diff --git a/colossalai/builder/builder.py b/colossalai/builder/builder.py index 6e8e24551..71971321a 100644 --- a/colossalai/builder/builder.py +++ b/colossalai/builder/builder.py @@ -16,8 +16,8 @@ def build_from_config(module, config: dict): of the return object :type config: dict :raises AssertionError: Raises an AssertionError if `module` is not a class - :return: An object of :class:`module` - :rtype: :class:`module` + :return: An object of interest + :rtype: Object """ assert inspect.isclass(module), 'module must be a class' return module(**config) @@ -62,8 +62,8 @@ def build_layer(config): :param config: A python dict or a :class:`colossalai.context.Config` object containing information used in the construction of the return object :type config: dict or :class:`colossalai.context.Config` - :return: An object of :class:`nn.Module` - :rtype: :class:`nn.Module` + :return: An object of :class:`torch.nn.Module` + :rtype: :class:`torch.nn.Module` """ return build_from_registry(config, LAYERS) @@ -75,8 +75,8 @@ def build_loss(config): :param config: A python dict or a :class:`colossalai.context.Config` object containing information used in the construction of the return object :type config: dict or :class:`colossalai.context.Config` - :return: An object of :class:`torch.autograd.Function` - :rtype: :class:`torch.autograd.Function` + :return: An object of :class:`torch.nn.modules.loss._Loss` + :rtype: :class:`torch.nn.modules.loss._Loss` """ return build_from_registry(config, LOSSES) @@ -87,8 +87,8 @@ def build_model(config): :param config: A python dict or a :class:`colossalai.context.Config` object containing information used in the construction of the return object :type config: dict or :class:`colossalai.context.Config` - :return: An object of :class:`nn.Module` - :rtype: :class:`nn.Module` + :return: An object of :class:`torch.nn.Module` + :rtype: :class:`torch.nn.Module` """ return build_from_registry(config, MODELS) @@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer): :type model: :class:`nn.Module` :param optimizer: An optimizer object containing parameters for the gradient handler :type optimizer: :class:`torch.optim.Optimizer` - :return: An object of :class:`BaseGradientHandler` - :rtype: :class:`BaseGradientHandler` + :return: An object of :class:`colossalai.engine.BaseGradientHandler` + :rtype: :class:`colossalai.engine.BaseGradientHandler` """ config_ = config.copy() config_['model'] = model @@ -151,8 +151,8 @@ def build_hooks(config, trainer): :type config: dict or :class:`colossalai.context.Config` :param trainer: A :class:`Trainer` object containing parameters for the hook :type trainer: :class:`Trainer` - :return: An object of :class:`BaseHook` - :rtype: :class:`BaseHook` + :return: 
An object of :class:`colossalai.trainer.hooks.BaseHook` + :rtype: :class:`colossalai.trainer.hooks.BaseHook` """ config_ = config.copy() config_['trainer'] = trainer @@ -182,8 +182,8 @@ def build_data_sampler(config, dataset): :param dataset: An object of :class:`torch.utils.data.Dataset` containing information used in the construction of the return object :type dataset: :class:`torch.utils.data.Dataset` - :return: An object of :class:`colossalai.nn.data.sampler.BaseSampler` - :rtype: :class:`colossalai.nn.data.sampler.BaseSampler` + :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler` + :rtype: :class:`colossalai.utils.data_sampler.BaseSampler` """ config_ = config.copy() config_['dataset'] = dataset @@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer): :param optimizer: An optimizer object containing parameters for the learning rate scheduler :type optimizer: :class:`torch.optim.Optimizer` - :param total_steps: Number of total steps of the learning rate scheduler - :type total_steps: int - :param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler - :type num_steps_per_epoch: int :return: An object of :class:`torch.optim.lr_scheduler` :rtype: :class:`torch.optim.lr_scheduler` """ diff --git a/colossalai/builder/pipeline.py b/colossalai/builder/pipeline.py index 5a568a909..a859030a7 100644 --- a/colossalai/builder/pipeline.py +++ b/colossalai/builder/pipeline.py @@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks): class PipelineModelInitializer(): + """An initializer to split the model into different stages for pipeline parallelism. + + An example for the model config is shown below. The class VisionTransformerFromConfig should + inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build the model from a sequence + of layer configurations. + + model_config = dict( + type='VisionTransformerFromConfig', + embedding_cfg=dict(...), + ... + ) + + :param config: configuration of the model + :type config: dict + :param num_chunks: the number of chunks you want to have on the current stage. This value should be 1 + in most cases unless you are using virtual pipeline parallelism. 
+ :type num_chunks: int + :param verbose: whether to print the logs + :type verbose: bool + + """ + def __init__(self, config, num_chunks, verbose=False): self.num_chunks = num_chunks self.ori_model = build_model(config) @@ -161,6 +183,13 @@ class PipelineModelInitializer(): self._logger.info(f"The total length of layers is {layer_length}", ranks=[0]) def initialize(self, partition_method='parameter'): + """Initialize the model object from the config passed + + :param partition_method: this parameter determines how you want to split your model layers into stages, + you can set it as 'layer' or 'parameter' + :type partition_method: str + + """ # Some space for initializing comunication groups self._interval = None self._partition_layers(method=partition_method) @@ -183,7 +212,7 @@ class PipelineModelInitializer(): # print_rank_0(param_counts) self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks) else: - assert method == 'layer', "Method should be a pre-set string" + raise ValueError("Method should be a pre-set string in [layer, parameter]") # Display the partition if gpc.get_global_rank() == 0 and self.verbose: diff --git a/colossalai/communication/collective.py b/colossalai/communication/collective.py index 5778028ea..e216cf17f 100644 --- a/colossalai/communication/collective.py +++ b/colossalai/communication/collective.py @@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int, :param tensor: Tensor to be gathered :param dim: The dimension concatenating in :param parallel_mode: Parallel group mode used in this communication - :type tensor: Tensor + :type tensor: :class:`torch.Tensor` :type dim: int - :type parallel_mode: ParallelMode + :type parallel_mode: :class:`colossalai.context.ParallelMode` :return: The tensor generated by all-gather - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ depth = gpc.get_world_size(parallel_mode) temp = tensor.clone() @@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int, :param tensor: Tensor to be reduced and scattered :param dim: The dimension scattering in :param parallel_mode: Parallel group mode used in this communication - :type tensor: Tensor + :type tensor: :class:`torch.Tensor` :type dim: int - :type parallel_mode: ParallelMode + :type parallel_mode: :class:`colossalai.context.ParallelMode` :return: The tensor generated by reduce-scatter - :rtype: Tensor + :rtype: :class:`Tensor` """ depth = gpc.get_world_size(parallel_mode) # temp = list(torch.chunk(tensor, depth, dim=dim)) diff --git a/colossalai/communication/p2p.py b/colossalai/communication/p2p.py index 3eb94ac60..1d0009d6a 100644 --- a/colossalai/communication/p2p.py +++ b/colossalai/communication/p2p.py @@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None): :type input_tensor_shape: torch.Size :type prev_rank: int, optional :return: The input tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ if gpc.is_first_rank(ParallelMode.PIPELINE): input_tensor = None @@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None): :type output_grad_shape: torch.Size :type next_rank: int, optional :return: The grad of output tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ if gpc.is_last_rank(ParallelMode.PIPELINE): output_tensor_grad = None @@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None): :param output_tensor: Tensor to be sent :param next_rank: The rank of the recipient of the tensor - :type output_tensor: Tensor + :type output_tensor: 
:class:`torch.Tensor` :type next_rank: int, optional """ if not gpc.is_last_rank(ParallelMode.PIPELINE): @@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None): :param input_tensor_grad: Tensor to be sent :param prev_rank: The rank of the recipient of the tensor - :type input_tensor_grad: Tensor + :type input_tensor_grad: :class:`torch.Tensor` :type prev_rank: int, optional """ if not gpc.is_first_rank(ParallelMode.PIPELINE): @@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor, :param output_tensor: Tensor to be sent :param output_grad_shape: The shape of the tensor to be recieved - :type output_tensor: Tensor - :type output_grad_shape: torch.Size + :type output_tensor: :class:`torch.Tensor` + :type output_grad_shape: :class:`torch.Size` :return: The grad of output tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ if gpc.is_last_rank(ParallelMode.PIPELINE): output_tensor_grad = None @@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad, :param input_tensor_grad: Tensor to be sent :param input_tensor_shape: The shape of the tensor to be recieved - :type input_tensor_grad: Tensor - :type input_tensor_shape: torch.Size + :type input_tensor_grad: :class:`torch.Tensor` + :type input_tensor_shape: :class:`torch.Size` :return: The input tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ if gpc.is_first_rank(ParallelMode.PIPELINE): input_tensor = None @@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor, :param output_tensor: Tensor to be sent :param input_tensor_shape: The shape of the tensor to be recieved - :type output_tensor: Tensor - :type input_tensor_shape: torch.Size + :type output_tensor: :class:`torch.Tensor` + :type input_tensor_shape: :class:`torch.Size` :return: The input tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ input_tensor, _ = _communicate(tensor_send_next=output_tensor, recv_prev=recv_prev, @@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad, :param input_tensor_grad: Tensor to be sent :param output_grad_shape: The shape of the tensor to be recieved - :type input_tensor_grad: Tensor - :type output_grad_shape: torch.Size + :type input_tensor_grad: :class:`torch.Tensor` + :type output_grad_shape: :class:`torch.Size` :return: The grad of output tensor in forward step - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad, recv_next=recv_next, @@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor, :param input_tensor_grad: Tensor sent to the previous :param input_tensor_shape: The shape of the tensor recieved from the previous :param output_grad_shape: The shape of the tensor recieved from the next - :type output_tensor: Tensor - :type input_tensor_grad: Tensor - :type input_tensor_shape: torch.Size - :type output_grad_shape: torch.Size + :type output_tensor: :class:`torch.Tensor` + :type input_tensor_grad: :class:`torch.Tensor` + :type input_tensor_shape: :class:`torch.Size` + :type output_grad_shape: :class:`torch.Size` :return: (the input tensor in forward step, the grad of output tensor in forward step) :rtype: (Tensor, Tensor) """ diff --git a/colossalai/communication/ring.py b/colossalai/communication/ring.py index d1b4266a6..6f42e90ab 100644 --- a/colossalai/communication/ring.py +++ b/colossalai/communication/ring.py @@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: 
ParallelMode): :param tensor_send_next: Tensor sent to next member :param parallel_mode: Parallel group mode used in this communication - :type tensor_send_next: Tensor - :type parallel_mode: ParallelMode + :type tensor_send_next: :class:`torch.Tensor` + :type parallel_mode: :class:`colossalai.context.ParallelMode` :return: The tensor recieved from the previous - :rtype: Tensor + :rtype: :class:`torch.Tensor` """ buffer_shape = tensor_send_next.size() diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 4f8e9f807..9d7d311c3 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -433,6 +433,9 @@ class ParallelContext: def set_device(self, device_ordinal: int = None): """Sets distributed processes to be bound to devices. + + :param device_ordinal: the device id to be bound to + :type device_ordinal: int """ global_rank = self.get_global_rank() if device_ordinal is None: @@ -445,6 +448,9 @@ class ParallelContext: def set_seed(self, seed: int): """Sets seeds for all random libraries. + + :param seed: seed for random states + :type seed: int """ random.seed(seed) np.random.seed(seed) diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py index 8a3f6eac3..90a7f0730 100644 --- a/colossalai/engine/_base_engine.py +++ b/colossalai/engine/_base_engine.py @@ -57,38 +57,61 @@ class Engine: @property def model(self): + """model attached to the engine""" return self._model @property def optimizer(self): + """optimizer attached to the engine""" return self._optimizer @property def criterion(self): + """criterion attached to the engine""" return self._criterion - @property - def schedule(self): - return self._schedule - def zero_grad(self): + """set the gradient of parameters to zero + """ self.optimizer.zero_grad() def step(self): + """execute parameter update + """ self._all_reduce_gradients() self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) self.optimizer.step() def backward(self, loss: Tensor): + """Start backward propagation given the loss value computed by a loss function + + :param loss: loss value computed by a loss function + :type loss: :class:`torch.Tensor` + """ return self.optimizer.backward(loss) def backward_by_grad(self, tensor, grad): + """Start backward propagation given the gradient of the output tensor + + :param loss: output tensor + :type loss: :class:`torch.Tensor` + :param grad: gradient passed back to the output + :type grad: :class:`torch.Tensor` + """ return self.optimizer.backward_by_grad(tensor, grad) def calc_loss(self, *args, **kwargs): + """compute the loss value + :return: the loss value + :rtype: :class:`torch.Tensor` + """ return self.criterion(*args, **kwargs) def __call__(self, *args, **kwargs): + """run the forward step for the model + :return: output the model + :rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor` + """ return self.model(*args, **kwargs) def _all_reduce_gradients(self): diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py index e28690cb0..aceee4e6c 100644 --- a/colossalai/engine/schedule/_base_schedule.py +++ b/colossalai/engine/schedule/_base_schedule.py @@ -48,7 +48,7 @@ class BaseSchedule(ABC): already in the same GPU as where the model's. 
:return: (data, label) - :rtype: (Tensor, Tensor) + :rtype: (:class:`Tensor`, :class:`torch.Tensor`) """ if data_iter is None: raise RuntimeError('Dataloader is not defined.') diff --git a/colossalai/engine/schedule/_non_pipeline_schedule.py b/colossalai/engine/schedule/_non_pipeline_schedule.py index 01e681941..0d8ee8c69 100644 --- a/colossalai/engine/schedule/_non_pipeline_schedule.py +++ b/colossalai/engine/schedule/_non_pipeline_schedule.py @@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule): :type data_iter: Iterator :type forward_only: bool, optional :type return_loss: bool, optional + :return: (output, label, loss) + :rtype: Tuple[:class:`torch.Tensor`] """ assert forward_only or return_loss, \ "The argument 'return_loss' has to be True when 'forward_only' is False, but got False." diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py index f0bc04427..c637622a1 100644 --- a/colossalai/engine/schedule/_pipeline_schedule.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule): """Forward step for passed-in model. If it is the first stage, the input tensor is obtained from data_iterator, otherwise the passed-in input_tensor is used. Returns output tensor. This is a helper function and can be ignored by users. + + :param engine: your engine object + :type engine: colossalai.engine.Engine + :param input_tensor: input tensor for this pipeline stage + :type input_tensor: :class:`torch.Tensor` + :param return_tensors: a list of tensors to return + :type return_tensors: List[:class:`torch.Tensor`] + + :return: output or the loss value of the current pipeline stage + :rtype: :class:`torch.Tensor` """ if input_tensor is None: @@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule): output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor. Returns the gradients with respect to the input tensor (None if first stage). This is a helper function and can be ignored by users. + + :param engine: your engine object + :type engine: colossalai.engine.Engine + :param input_tensor: input tensor for this pipeline stage + :type input_tensor: :class:`torch.Tensor` + :param output_tensor: output tensor for this pipeline stage + :type output_tensor: :class:`torch.Tensor` + :param output_tensor_grad: gradient of output tensor for this pipeline stage + :type output_tensor_grad: :class:`torch.Tensor` + + :return: gradient of input tensor + :rtype: :class:`torch.Tensor` """ # Retain the grad on the input_tensor. @@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule): """Runs non-interleaved 1F1B schedule, with communication between pipeline stages. Returns a tuple with losses if the last stage, an empty tuple otherwise. + :param engine: your engine object + :type engine: colossalai.engine.Engine + :param data_iter: dataloader as the form of an iterator, obtained by calling iter(dataloader) + :type data_iter: Iterable + :param forward_only: whether run forward step only. Default is false. If true, no backward will be run. + :type forward_only: bool + :param return_loss: whether returns the loss value. Default is true. 
+ :type return_loss: bool + :return: (output, label, loss) + :rtype: Tuple[:class:`torch.Tensor`] """ assert forward_only or return_loss, \ diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 4401dd9d0..8817daf8c 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict], :param local_rank: rank for the process on the node and is used to set the default CUDA device, defaults to None. If local_rank = None, the default device ordinal will be calculated automatically :type local_rank: int, optional + :param verbose: whether to print logs + :type verbose: bool :raises Exception: raise exception when config type is wrong ''' gpc.verbose = verbose @@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict], backend: str = 'nccl', seed: int = 1024, verbose: bool = True): + '''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables + set by SLURM + + :param config: config file or config file path are both acceptable + :type config: Union[str, dict, Config] + :param host: the master address for distributed training + :type host: str + :param port: the master port for distributed training + :type port: str + :param backend: backend for torch.distributed + :type backend: str + :param verbose: whether to print logs + :type verbose: bool + ''' rank = int(os.environ['SLURM_PROCID']) world_size = int(os.environ['SLURM_NPROCS']) launch(config=config, @@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict], backend: str = 'nccl', seed: int = 1024, verbose: bool = True): + '''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables + set by OpenMPI + + :param config: config file or config file path are both acceptable + :type config: Union[str, dict, Config] + :param host: the master address for distributed training + :type host: str + :param port: the master port for distributed training + :type port: str + :param backend: backend for torch.distributed + :type backend: str + :param verbose: whether to print logs + :type verbose: bool + ''' rank = int(os.environ['OMPI_COMM_WORLD_RANK']) local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) @@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = 'nccl', seed: int = 1024, verbose: bool = True): + '''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size + from the environment variables set by PyTorch + + :param config: config file or config file path are both acceptable + :type config: Union[str, dict, Config] + :param host: the master address for distributed training + :type host: str + :param port: the master port for distributed training + :type port: str + :param backend: backend for torch.distributed + :type backend: str + :param verbose: whether to print logs + :type verbose: bool + ''' rank = int(os.environ['RANK']) local_rank = int(os.environ['LOCAL_RANK']) world_size = int(os.environ['WORLD_SIZE']) @@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]], ''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config. 
:param model: your model instance - :type model: a single or a list of ``torch.nn.Module`` objects + :type model: :class:`torch.nn.Module` :param optimizer: your optimizer instance - :type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects + :type optimizer: :class:`torch.optim.Optimizer` :param criterion: your criterion instance - :type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects - :param train_dataloader: dataloaders for training data - :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None - :param train_dataloader: dataloaders for testing data - :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None - :return: (engine, criterion, train_dataloader, test_dataloader) + :type criterion: :class:`torch.nn.modules.loss._Loss` + :param train_dataloader: dataloader for training data + :type train_dataloader: :class:`torch.utils.data.DataLoader` + :param test_dataloader: dataloader for testing data + :type test_dataloader: :class:`torch.utils.data.DataLoader` + :param lr_scheduler: your lr scheduler instance + :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler` + :param verbose: whether to print logs + :type verbose: bool + :return: (engine, train_dataloader, test_dataloader, lr_scheduler) :rtype: tuple ''' # get logger diff --git a/colossalai/logging/__init__.py b/colossalai/logging/__init__.py index 5ee86c45f..1950ad8e9 100644 --- a/colossalai/logging/__init__.py +++ b/colossalai/logging/__init__.py @@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger'] def get_dist_logger(name='root'): """Get logger instance based on name. The DistributedLogger will create singleton instances, which means that only one logger instance is created per name. 
+ +  :param name: name of the logger, name must be unique + :type name: str + + :return: a distributed logger instance + :rtype: :class:`colossalai.logging.DistributedLogger` """ return DistributedLogger.get_instance(name=name) diff --git a/colossalai/nn/layer/non_parallel_layers/_vit.py b/colossalai/nn/layer/non_parallel_layers/_vit.py index 59a12fee2..730cb472a 100644 --- a/colossalai/nn/layer/non_parallel_layers/_vit.py +++ b/colossalai/nn/layer/non_parallel_layers/_vit.py @@ -47,9 +47,24 @@ class ViTBlock(nn.Module): @LAYERS.register_module class VanillaViTPatchEmbedding(nn.Module): """ 2D Image to Patch Embedding + + :param img_size: image size + :type img_size: int + :param patch_size: size of a patch + :type patch_size: int + :param in_chans: input channels + :type in_chans: int + :param embed_dim: embedding dimension + :type embed_dim: int + :param norm_layer: layer norm class, defaults to None + :type norm_layer: Callable + :param flatten: whether to flatten the output + :type flatten: bool + :param drop: dropout rate + :type drop: float """ - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.): + def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) @@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module): @LAYERS.register_module class VanillaViTMLP(nn.Module): """ MLP as used in Vision Transformer, MLP-Mixer and related networks + + :param in_features: input channels + :type in_features: int + :param hidden_features: channels of the output of the first dense layer + :type hidden_features: int + :param out_features: channels of the output of the second dense layer + :type out_features: int + :param act_layer: activation function + :type act_layer: Callable + :param drop: dropout rate + :type drop: float + """ - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.): super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.fc2 = nn.Linear(hidden_features, out_features) @@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False): changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the argument. + :param drop_prob: probability for dropout + :type drop_prob: float + :param training: whether it is training mode + :type training: bool + """ if drop_prob == 0. or not training: return x @@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False): @LAYERS.register_module class VanillaViTDropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ +  :param drop_prob: probability for dropout + :type drop_prob: float """ def __init__(self, drop_prob=0.): @@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module): :param dim: dimension of input tensor :type dim: int - :param num_heads: number of attention heads, defaults to 8 + :param num_heads: number of attention heads :type num_heads: int, optional :param qkv_bias: enable bias for qkv if True, defaults to False :type qkv_bias: bool, optional @@ -155,7 +188,7 @@ + :type proj_drop: float, optional """ - def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index ed4523c75..2baa41a49 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): added functionality to handle model parallel parameters. Note that the gradients are modified in place. - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. + :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized + :type parameters: (Iterable[Tensor] or Tensor) + :param max_norm: max norm of the gradients + :type max_norm: float or int + :param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm. + :type norm_type: float or int - Returns: - Total norm of the parameters (viewed as a single vector). + :return: Total norm of the parameters (viewed as a single vector). + :rtype: float """ if isinstance(parameters, torch.Tensor): diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py index 0e7400345..a6c2c7480 100644 --- a/colossalai/utils/data_sampler/data_parallel_sampler.py +++ b/colossalai/utils/data_sampler/data_parallel_sampler.py @@ -123,12 +123,23 @@ def get_dataloader(dataset, stage and label on the last stage :param dataset: a :class:utils.data.dataset dataset + :param shuffle: whether to shuffle the dataset :param seed: random worker seed, defaults to 1024 - :type seed: int, optional - :param add_sampler_if_possible: [description], defaults to False - :type add_sampler_if_possible: bool, optional - :return: a :class:utils.data.dataset dataloader - :rtype: torch.utils.data.dataset + :param add_sampler: add a DataParallelSampler to the dataset + :param drop_last: drop the last incomplete batch of data + :param pin_memory: whether to pin memory address in CPU memory + :param num_workers: number of worker threads for this dataloader + + :type dataset: :class:`torch.utils.data.Dataset` + :type shuffle: bool, optional. Default is False + :type seed: int, optional. Default is 1024 + :type add_sampler: bool, optional. Default is True + :type drop_last: bool, optional. Default is False + :type pin_memory: bool, optional. Default is False + :type num_workers: int, optional. 
Default is 0 + + :return: an object of :class:`torch.utils.data.DataLoader` + :rtype: :class:`torch.utils.data.DataLoader` ''' _kwargs = kwargs.copy() diff --git a/colossalai/utils/gradient_accumulation/__init__.py b/colossalai/utils/gradient_accumulation/__init__.py index 342f360c1..4c4bf3438 100644 --- a/colossalai/utils/gradient_accumulation/__init__.py +++ b/colossalai/utils/gradient_accumulation/__init__.py @@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module, accumulate_size: int, gradient_handlers: List[BaseGradientHandler] = None, lr_scheduler: _LRScheduler = None): + """ + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param dataloader: your dataloader object + :type dataloader: Iterable + :param accumulate_size: the number of steps to accumulate gradients + :type accumulate_size: int + :param gradient_handlers: list of gradient handler objects. Default is None + :type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`] + :param lr_scheduler: your lr scheduler object. Default is None + :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler` + """ optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model) dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size) diff --git a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py index 0aa25188a..8c159c628 100644 --- a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py +++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py @@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler class GradAccumOptimizer(ColossalaiOptimizer): + """A wrapper for the optimizer to enable gradient accumulation by skipping the steps + before accumulation size is reached + + :param optim: your optimizer object + :type optim: :class:`torch.optim.Optimizer` + :param accumulate_size: the number of steps to accumulate gradients + :type accumulate_size: int + :param model: your model object to check if it is DDP for special handling of no_sync() context + :type model: :class:`torch.nn.Module` + + """ def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None): super().__init__(optim) @@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer): class GradAccumDataloader(): + """A wrapper for a dataloader to enable gradient accumulation by dropping the last incomplete steps. + + For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will + be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle. + Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader + (e.g. a DALI dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches. 
+ +  :param dataloader: your dataloader object + :type dataloader: Iterable + :param accumulate_size: the number of steps to accumulate gradients + :type accumulate_size: int + + """ def __init__(self, dataloader: Iterable, accumulate_size: int) -> None: self.dataloader = dataloader @@ -99,6 +123,15 @@ class GradAccumDataloader(): class GradAccumLrSchedulerByStep(_LRScheduler): + """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps + before accumulation size is reached + + :param lr_scheduler: your lr scheduler object + :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler` + :param accumulate_size: the number of steps to accumulate gradients + :type accumulate_size: int + + """ def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None: self.lr_scheduler = lr_scheduler @@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler): class GradAccumGradientHandler(): + """A wrapper for the gradient handler to enable gradient accumulation by skipping the steps + before accumulation size is reached + + :param grad_handler: your gradient handler object + :type grad_handler: :class:`colossalai.engine.BaseGradientHandler` + :param accumulate_size: the number of steps to accumulate gradients + :type accumulate_size: int + + """ def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None: assert isinstance(grad_handler, BaseGradientHandler), \ diff --git a/colossalai/utils/memory.py b/colossalai/utils/memory.py index 904ec894b..a71ffc4ba 100644 --- a/colossalai/utils/memory.py +++ b/colossalai/utils/memory.py @@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False): :param message: a prefix message to add in the log :type message: str + :param logger: an instance of :class:`colossalai.logging.DistributedLogger` + :type logger: :class:`colossalai.logging.DistributedLogger` + :param report_cpu: whether to report CPU memory + :type report_cpu: bool :raises EnvironmentError: raise error if no distributed environment has been initialized ''' if not gpc.is_initialized(ParallelMode.GLOBAL): diff --git a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py index b81fb5f1c..48a1e4b98 100644 --- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py +++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py @@ -2,6 +2,13 @@ class MultiTensorApply(object): + """ + Apply an operation to a list of tensors efficiently + + :param chunk_size: size of a chunk + :type chunk_size: int + """ + available = False warned = False diff --git a/colossalai/utils/timer.py b/colossalai/utils/timer.py index bc0205344..1d121d5de 100644 --- a/colossalai/utils/timer.py +++ b/colossalai/utils/timer.py @@ -74,6 +74,9 @@ class Timer: class MultiTimer: '''An object contains multiple timers + + :param on: whether the timer is enabled. 
Default is True + :type on: bool ''' def __init__(self, on: bool = True): diff --git a/colossalai/zero/__init__.py b/colossalai/zero/__init__.py index 6b7619a1f..e464be5ce 100644 --- a/colossalai/zero/__init__.py +++ b/colossalai/zero/__init__.py @@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module, optimizer: Optimizer, level: int, zero_config): + """ + A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading + + :param model: your model object + :type model: :class:`torch.nn.Module` + :param optimizer: your optimizer object + :type optimizer: :class:`torch.optim.Optimizer` + :param level: optimizer level, can be 2 or 3 + :type level: int + :param zero_config: configuration for zero + :type zero_config: dict + + :return: (model, optimizer) + :rtype: Tuple + """ assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided' if level == 2: if is_no_pp_or_last_stage(): diff --git a/configs/resnet/resnet50.py b/configs/resnet/resnet50.py deleted file mode 100644 index d5ecbdfef..000000000 --- a/configs/resnet/resnet50.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- -import os - -IMG_SIZE = 224 -BATCH_SIZE = 256 -NUM_EPOCHS = 100 - -model = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 4, 6, 3], - num_cls=10 -) - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=os.environ['DATA'], - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - shuffle=True, - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=os.environ['DATA'], - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - ) -) - -parallelization = dict( - pipeline=1, - tensor=dict(size=1, mode=None), -) - -optimizer = dict( - type='Adam', - lr=0.01 -) - -loss = dict( - type='CrossEntropyLoss' -) - -from colossalai.engine import AMP_TYPE - -fp16 = dict( - mode=AMP_TYPE.APEX, - opt_level='O2', -) diff --git a/configs/sample_config.py b/configs/sample_config.py deleted file mode 100644 index b9768d2c1..000000000 --- a/configs/sample_config.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -NUM_EPOCH = int - -model = dict() -train_data = dict() -test_data = dict() -optimizer = dict() -loss = dict() - -fp16 = dict() -zero = dict() - -gradient_handler = [] -parallel = dict() -hooks = [] - -cudnn_benchmark = True -cudnn_deterministic = False - -logging = dict() diff --git a/configs/vit/vit_2d.py b/configs/vit/vit_2d.py deleted file mode 100644 index b771b583e..000000000 --- a/configs/vit/vit_2d.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 2 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 -NUM_EPOCHS = 60 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='RandomCrop', size=IMG_SIZE, 
padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - drop_last=True, - pin_memory=True, - shuffle=True, - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - checkpoint=True - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=4, - checkpoint=True - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='Accuracy2DHook'), - dict(type='LossHook'), - dict( - type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict( - type='LinearWarmupLR', - warmup_steps=5 - ) - ), - # dict(type='TensorboardHook', log_dir='./tb_logs'), - # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') -] - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=4, mode='2d'), -) - -# for fp16 training -# from colossalai.engine import AMP_TYPE -# fp16 = dict( -# mode=AMP_TYPE.PARALLEL, -# initial_scale=2 ** 8 -# ) - -# only needed when pipeline parallel is used -# schedule = dict( -# num_microbatches=8 -# ) - - -logging = dict( - root_path='./logs' -) diff --git a/configs/vit/vit_3d.py b/configs/vit/vit_3d.py deleted file mode 100644 index ea605dac8..000000000 --- a/configs/vit/vit_3d.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from pathlib import Path - -from colossalai.context import ParallelMode -from colossalai.engine import AMP_TYPE - -try: - import model_zoo -except: - print('You need to set model_zoo to your PYTHONPATH to use the models in the collection') - -BATCH_SIZE = 512 -IMG_SIZE = 32 -NUM_EPOCHS = 60 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - 
pin_memory=True, - num_workers=2, - shuffle=True, - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=2, - ) -) - -optimizer = dict( - type='Adam', - lr=0.001 -) - -loss = dict( - type='CrossEntropyLoss3D', - input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT, - weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, -) - -model = dict( - type='vit_tiny_3d_patch4_32', - drop_rate=0.1, -) - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='LogTimingByEpochHook'), - dict(type='LogMemoryByEpochHook'), - dict( - type='Accuracy3DHook', - input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT, - weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, - ), - dict(type='LossHook'), - dict(type='TensorboardHook', log_dir='./tfb_logs'), - dict( - type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict( - type='LinearWarmupLR', - warmup_steps=5 - ) - ), - # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') -] - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=8, mode='3d'), -) - -fp16 = dict( - mode=AMP_TYPE.PARALLEL, - initial_scale=2 ** 8 -) - -logging = dict( - root_path='./logs' -) diff --git a/docs/amp.md b/docs/amp.md index 1dde09579..8072849f5 100644 --- a/docs/amp.md +++ b/docs/amp.md @@ -77,10 +77,10 @@ fp16 = dict( ) ``` -## Tensor Parallel AMP +## Naive AMP We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor -and pipeline parallelism. +and pipeline parallelism. This AMP mode will cast all operations into fp16. The following conde block show a config file for this mode. diff --git a/docs/colossalai/colossalai.amp.apex_amp.rst b/docs/colossalai/colossalai.amp.apex_amp.rst new file mode 100644 index 000000000..c3ed5420c --- /dev/null +++ b/docs/colossalai/colossalai.amp.apex_amp.rst @@ -0,0 +1,5 @@ +colossalai.amp.apex\_amp +========================== + +.. automodule:: colossalai.amp.apex_amp + :members: diff --git a/docs/colossalai/colossalai.amp.naive_amp.rst b/docs/colossalai/colossalai.amp.naive_amp.rst new file mode 100644 index 000000000..0bf2795bf --- /dev/null +++ b/docs/colossalai/colossalai.amp.naive_amp.rst @@ -0,0 +1,5 @@ +colossalai.amp.naive\_amp +========================== + +.. automodule:: colossalai.amp.naive_amp + :members: diff --git a/docs/colossalai/colossalai.amp.rst b/docs/colossalai/colossalai.amp.rst new file mode 100644 index 000000000..0c7f22d6c --- /dev/null +++ b/docs/colossalai/colossalai.amp.rst @@ -0,0 +1,13 @@ +colossalai.amp +================== + +.. toctree:: + :maxdepth: 2 + + colossalai.amp.torch_amp + colossalai.amp.apex_amp + colossalai.amp.naive_amp + + +.. automodule:: colossalai.amp + :members: diff --git a/docs/colossalai/colossalai.amp.torch_amp.rst b/docs/colossalai/colossalai.amp.torch_amp.rst new file mode 100644 index 000000000..d71ff3c0d --- /dev/null +++ b/docs/colossalai/colossalai.amp.torch_amp.rst @@ -0,0 +1,5 @@ +colossalai.amp.torch\_amp +========================== + +.. 
automodule:: colossalai.amp.torch_amp + :members: diff --git a/docs/colossalai/colossalai.builder.rst b/docs/colossalai/colossalai.builder.rst index 60b8501c8..d2b96604c 100644 --- a/docs/colossalai/colossalai.builder.rst +++ b/docs/colossalai/colossalai.builder.rst @@ -1,12 +1,12 @@ colossalai.builder ================== -.. automodule:: colossalai.builder - :members: - - .. toctree:: :maxdepth: 2 colossalai.builder.builder colossalai.builder.pipeline + + +.. automodule:: colossalai.builder + :members: diff --git a/docs/colossalai/colossalai.checkpointing.rst b/docs/colossalai/colossalai.checkpointing.rst deleted file mode 100644 index 7db9af190..000000000 --- a/docs/colossalai/colossalai.checkpointing.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.checkpointing -======================== - -.. automodule:: colossalai.checkpointing - :members: diff --git a/docs/colossalai/colossalai.communication.rst b/docs/colossalai/colossalai.communication.rst index 5086fa663..05ad0d4d7 100644 --- a/docs/colossalai/colossalai.communication.rst +++ b/docs/colossalai/colossalai.communication.rst @@ -1,10 +1,6 @@ colossalai.communication ======================== -.. automodule:: colossalai.communication - :members: - - .. toctree:: :maxdepth: 2 @@ -12,3 +8,7 @@ colossalai.communication colossalai.communication.p2p colossalai.communication.ring colossalai.communication.utils + + +.. automodule:: colossalai.communication + :members: diff --git a/docs/colossalai/colossalai.context.random.rst b/docs/colossalai/colossalai.context.random.rst index 8d4b9c56a..58ed5b269 100644 --- a/docs/colossalai/colossalai.context.random.rst +++ b/docs/colossalai/colossalai.context.random.rst @@ -1,11 +1,11 @@ colossalai.context.random ========================= -.. automodule:: colossalai.context.random - :members: - - .. toctree:: :maxdepth: 2 colossalai.context.random.seed_manager + + +.. automodule:: colossalai.context.random + :members: diff --git a/docs/colossalai/colossalai.context.rst b/docs/colossalai/colossalai.context.rst index babab5099..4ff29ce3d 100644 --- a/docs/colossalai/colossalai.context.rst +++ b/docs/colossalai/colossalai.context.rst @@ -1,9 +1,6 @@ colossalai.context ================== -.. automodule:: colossalai.context - :members: - .. toctree:: :maxdepth: 2 @@ -17,3 +14,7 @@ colossalai.context colossalai.context.config colossalai.context.parallel_context colossalai.context.parallel_mode + + +.. automodule:: colossalai.context + :members: diff --git a/docs/colossalai/colossalai.engine.amp.amp_type.rst b/docs/colossalai/colossalai.engine.amp.amp_type.rst deleted file mode 100644 index ec1afdfa6..000000000 --- a/docs/colossalai/colossalai.engine.amp.amp_type.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.engine.amp.amp\_type -=============================== - -.. automodule:: colossalai.engine.amp.amp_type - :members: diff --git a/docs/colossalai/colossalai.engine.amp.grad_scaler.rst b/docs/colossalai/colossalai.engine.amp.grad_scaler.rst deleted file mode 100644 index 752079eab..000000000 --- a/docs/colossalai/colossalai.engine.amp.grad_scaler.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.engine.amp.grad\_scaler -================================== - -.. automodule:: colossalai.engine.amp.grad_scaler - :members: diff --git a/docs/colossalai/colossalai.engine.amp.rst b/docs/colossalai/colossalai.engine.amp.rst deleted file mode 100644 index 987f27f6a..000000000 --- a/docs/colossalai/colossalai.engine.amp.rst +++ /dev/null @@ -1,12 +0,0 @@ -colossalai.engine.amp -===================== - -.. 
automodule:: colossalai.engine.amp - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.engine.amp.amp_type - colossalai.engine.amp.grad_scaler diff --git a/docs/colossalai/colossalai.engine.rst b/docs/colossalai/colossalai.engine.rst index 915be4c98..5b37fb842 100644 --- a/docs/colossalai/colossalai.engine.rst +++ b/docs/colossalai/colossalai.engine.rst @@ -1,12 +1,12 @@ colossalai.engine ================= -.. automodule:: colossalai.engine - :members: - .. toctree:: :maxdepth: 2 - colossalai.engine.amp colossalai.engine.gradient_handler colossalai.engine.schedule + + +.. automodule:: colossalai.engine + :members: diff --git a/docs/colossalai/colossalai.logging.rst b/docs/colossalai/colossalai.logging.rst index a7a5cec72..71bcdd16b 100644 --- a/docs/colossalai/colossalai.logging.rst +++ b/docs/colossalai/colossalai.logging.rst @@ -1,11 +1,11 @@ colossalai.logging ================== -.. automodule:: colossalai.logging - :members: - - .. toctree:: :maxdepth: 2 colossalai.logging.logging + + +.. automodule:: colossalai.logging + :members: diff --git a/docs/colossalai/colossalai.nn.data.base_dataset.rst b/docs/colossalai/colossalai.nn.data.base_dataset.rst deleted file mode 100644 index 40e5e6b03..000000000 --- a/docs/colossalai/colossalai.nn.data.base_dataset.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.data.base\_dataset -================================ - -.. automodule:: colossalai.nn.data.base_dataset - :members: diff --git a/docs/colossalai/colossalai.nn.data.caltech101_dataset.rst b/docs/colossalai/colossalai.nn.data.caltech101_dataset.rst deleted file mode 100644 index ed36b049f..000000000 --- a/docs/colossalai/colossalai.nn.data.caltech101_dataset.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.data.caltech101\_dataset -====================================== - -.. automodule:: colossalai.nn.data.caltech101_dataset - :members: diff --git a/docs/colossalai/colossalai.nn.data.cifar10_dataset.rst b/docs/colossalai/colossalai.nn.data.cifar10_dataset.rst deleted file mode 100644 index efa8068a4..000000000 --- a/docs/colossalai/colossalai.nn.data.cifar10_dataset.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.data.cifar10\_dataset -=================================== - -.. automodule:: colossalai.nn.data.cifar10_dataset - :members: diff --git a/docs/colossalai/colossalai.nn.data.rst b/docs/colossalai/colossalai.nn.data.rst deleted file mode 100644 index 0f1ac9e75..000000000 --- a/docs/colossalai/colossalai.nn.data.rst +++ /dev/null @@ -1,18 +0,0 @@ -colossalai.nn.data -================== - -.. automodule:: colossalai.nn.data - :members: - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.data.sampler - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.data.base_dataset - colossalai.nn.data.caltech101_dataset - colossalai.nn.data.cifar10_dataset diff --git a/docs/colossalai/colossalai.nn.data.sampler.base_sampler.rst b/docs/colossalai/colossalai.nn.data.sampler.base_sampler.rst deleted file mode 100644 index 7fc47a57f..000000000 --- a/docs/colossalai/colossalai.nn.data.sampler.base_sampler.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.data.sampler.base\_sampler -======================================== - -.. 
automodule:: colossalai.nn.data.sampler.base_sampler - :members: diff --git a/docs/colossalai/colossalai.nn.data.sampler.data_parallel_sampler.rst b/docs/colossalai/colossalai.nn.data.sampler.data_parallel_sampler.rst deleted file mode 100644 index 0c3db3423..000000000 --- a/docs/colossalai/colossalai.nn.data.sampler.data_parallel_sampler.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.data.sampler.data\_parallel\_sampler -================================================== - -.. automodule:: colossalai.nn.data.sampler.data_parallel_sampler - :members: diff --git a/docs/colossalai/colossalai.nn.data.sampler.rst b/docs/colossalai/colossalai.nn.data.sampler.rst deleted file mode 100644 index e8f8ee036..000000000 --- a/docs/colossalai/colossalai.nn.data.sampler.rst +++ /dev/null @@ -1,12 +0,0 @@ -colossalai.nn.data.sampler -========================== - -.. automodule:: colossalai.nn.data.sampler - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.data.sampler.base_sampler - colossalai.nn.data.sampler.data_parallel_sampler diff --git a/docs/colossalai/colossalai.nn.layer.non_parallel_layers.rst b/docs/colossalai/colossalai.nn.layer.non_parallel_layers.rst new file mode 100644 index 000000000..8103d92b8 --- /dev/null +++ b/docs/colossalai/colossalai.nn.layer.non_parallel_layers.rst @@ -0,0 +1,5 @@ +colossalai.nn.layer.non\_parallel\_layers +====================================== + +.. automodule:: colossalai.nn.layer.non_parallel_layers + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_1d.rst b/docs/colossalai/colossalai.nn.layer.parallel_1d.rst index 3a8ed6206..a765b04ad 100644 --- a/docs/colossalai/colossalai.nn.layer.parallel_1d.rst +++ b/docs/colossalai/colossalai.nn.layer.parallel_1d.rst @@ -1,11 +1,11 @@ colossalai.nn.layer.parallel\_1d ================================ -.. automodule:: colossalai.nn.layer.parallel_1d - :members: - - .. toctree:: :maxdepth: 2 colossalai.nn.layer.parallel_1d.layers + + +.. automodule:: colossalai.nn.layer.parallel_1d + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2d.rst b/docs/colossalai/colossalai.nn.layer.parallel_2d.rst index f5ad41a1b..d72fef9a9 100644 --- a/docs/colossalai/colossalai.nn.layer.parallel_2d.rst +++ b/docs/colossalai/colossalai.nn.layer.parallel_2d.rst @@ -1,11 +1,11 @@ colossalai.nn.layer.parallel\_2d ================================ -.. automodule:: colossalai.nn.layer.parallel_2d - :members: - - .. toctree:: :maxdepth: 2 colossalai.nn.layer.parallel_2d.layers + + +.. automodule:: colossalai.nn.layer.parallel_2d + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst b/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst index 5869bdee9..4ba8b1348 100644 --- a/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst +++ b/docs/colossalai/colossalai.nn.layer.parallel_2p5d.rst @@ -1,11 +1,11 @@ colossalai.nn.layer.parallel\_2p5d ================================== -.. automodule:: colossalai.nn.layer.parallel_2p5d - :members: - - .. toctree:: :maxdepth: 2 colossalai.nn.layer.parallel_2p5d.layers + + +.. automodule:: colossalai.nn.layer.parallel_2p5d + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_3d.rst b/docs/colossalai/colossalai.nn.layer.parallel_3d.rst index bb55a63e5..d0e82c838 100644 --- a/docs/colossalai/colossalai.nn.layer.parallel_3d.rst +++ b/docs/colossalai/colossalai.nn.layer.parallel_3d.rst @@ -1,11 +1,11 @@ colossalai.nn.layer.parallel\_3d ================================ -.. 
automodule:: colossalai.nn.layer.parallel_3d - :members: - - .. toctree:: :maxdepth: 2 colossalai.nn.layer.parallel_3d.layers + + +.. automodule:: colossalai.nn.layer.parallel_3d + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst b/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst index 24e8941d4..dfea23ab3 100644 --- a/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst +++ b/docs/colossalai/colossalai.nn.layer.parallel_sequence.rst @@ -1,11 +1,11 @@ colossalai.nn.layer.parallel\_sequence ====================================== -.. automodule:: colossalai.nn.layer.parallel_sequence - :members: - - .. toctree:: :maxdepth: 2 colossalai.nn.layer.parallel_sequence.layers + + +.. automodule:: colossalai.nn.layer.parallel_sequence + :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.layers.rst b/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.layers.rst deleted file mode 100644 index 93798dc91..000000000 --- a/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.layers.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.parallel\_vision\_transformer.layers -======================================================== - -.. automodule:: colossalai.nn.layer.parallel_vision_transformer.layers - :members: diff --git a/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.rst b/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.rst deleted file mode 100644 index 7c96aa19b..000000000 --- a/docs/colossalai/colossalai.nn.layer.parallel_vision_transformer.rst +++ /dev/null @@ -1,11 +0,0 @@ -colossalai.nn.layer.parallel\_vision\_transformer -================================================= - -.. automodule:: colossalai.nn.layer.parallel_vision_transformer - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.layer.parallel_vision_transformer.layers diff --git a/docs/colossalai/colossalai.nn.layer.rst b/docs/colossalai/colossalai.nn.layer.rst index 5746a2d72..1538c3f02 100644 --- a/docs/colossalai/colossalai.nn.layer.rst +++ b/docs/colossalai/colossalai.nn.layer.rst @@ -1,9 +1,6 @@ colossalai.nn.layer =================== -.. automodule:: colossalai.nn.layer - :members: - .. toctree:: :maxdepth: 2 @@ -12,13 +9,10 @@ colossalai.nn.layer colossalai.nn.layer.parallel_2p5d colossalai.nn.layer.parallel_3d colossalai.nn.layer.parallel_sequence - colossalai.nn.layer.parallel_vision_transformer - colossalai.nn.layer.vanilla_resnet - colossalai.nn.layer.vanilla_vision_transformer + colossalai.nn.layer.non_parallel_layers colossalai.nn.layer.wrapper - - -.. toctree:: - :maxdepth: 2 - colossalai.nn.layer.base_layer + + +.. automodule:: colossalai.nn.layer + :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.basic_block.rst b/docs/colossalai/colossalai.nn.layer.vanilla_resnet.basic_block.rst deleted file mode 100644 index f4bad38f7..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.basic_block.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.vanilla\_resnet.basic\_block -================================================ - -.. 
automodule:: colossalai.nn.layer.vanilla_resnet.basic_block - :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.bottleneck.rst b/docs/colossalai/colossalai.nn.layer.vanilla_resnet.bottleneck.rst deleted file mode 100644 index 31213e14c..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.bottleneck.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.vanilla\_resnet.bottleneck -============================================== - -.. automodule:: colossalai.nn.layer.vanilla_resnet.bottleneck - :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.conv.rst b/docs/colossalai/colossalai.nn.layer.vanilla_resnet.conv.rst deleted file mode 100644 index 82fb1571b..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.conv.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.vanilla\_resnet.conv -======================================== - -.. automodule:: colossalai.nn.layer.vanilla_resnet.conv - :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.reslayer.rst b/docs/colossalai/colossalai.nn.layer.vanilla_resnet.reslayer.rst deleted file mode 100644 index 74715cd24..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.reslayer.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.vanilla\_resnet.reslayer -============================================ - -.. automodule:: colossalai.nn.layer.vanilla_resnet.reslayer - :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.rst b/docs/colossalai/colossalai.nn.layer.vanilla_resnet.rst deleted file mode 100644 index 6c98cd3a7..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_resnet.rst +++ /dev/null @@ -1,14 +0,0 @@ -colossalai.nn.layer.vanilla\_resnet -=================================== - -.. automodule:: colossalai.nn.layer.vanilla_resnet - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.layer.vanilla_resnet.basic_block - colossalai.nn.layer.vanilla_resnet.bottleneck - colossalai.nn.layer.vanilla_resnet.conv - colossalai.nn.layer.vanilla_resnet.reslayer diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.layers.rst b/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.layers.rst deleted file mode 100644 index e58155c62..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.layers.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.layer.vanilla\_vision\_transformer.layers -======================================================= - -.. automodule:: colossalai.nn.layer.vanilla_vision_transformer.layers - :members: diff --git a/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.rst b/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.rst deleted file mode 100644 index 5164b03f6..000000000 --- a/docs/colossalai/colossalai.nn.layer.vanilla_vision_transformer.rst +++ /dev/null @@ -1,11 +0,0 @@ -colossalai.nn.layer.vanilla\_vision\_transformer -================================================ - -.. automodule:: colossalai.nn.layer.vanilla_vision_transformer - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.layer.vanilla_vision_transformer.layers diff --git a/docs/colossalai/colossalai.nn.loss.base_loss.rst b/docs/colossalai/colossalai.nn.loss.base_loss.rst deleted file mode 100644 index 0396ac1b5..000000000 --- a/docs/colossalai/colossalai.nn.loss.base_loss.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.loss.base\_loss -============================= - -.. 
automodule:: colossalai.nn.loss.base_loss - :members: diff --git a/docs/colossalai/colossalai.nn.loss.cross_entropy_1d.rst b/docs/colossalai/colossalai.nn.loss.cross_entropy_1d.rst deleted file mode 100644 index aeb21ae3b..000000000 --- a/docs/colossalai/colossalai.nn.loss.cross_entropy_1d.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.loss.cross\_entropy\_1d -===================================== - -.. automodule:: colossalai.nn.loss.cross_entropy_1d - :members: diff --git a/docs/colossalai/colossalai.nn.loss.rst b/docs/colossalai/colossalai.nn.loss.rst index face8dbef..8efd847d6 100644 --- a/docs/colossalai/colossalai.nn.loss.rst +++ b/docs/colossalai/colossalai.nn.loss.rst @@ -1,15 +1,13 @@ colossalai.nn.loss ================== -.. automodule:: colossalai.nn.loss - :members: - - .. toctree:: :maxdepth: 2 - colossalai.nn.loss.base_loss - colossalai.nn.loss.cross_entropy_1d colossalai.nn.loss.cross_entropy_2d colossalai.nn.loss.cross_entropy_2p5d colossalai.nn.loss.cross_entropy_3d + + +.. automodule:: colossalai.nn.loss + :members: diff --git a/docs/colossalai/colossalai.nn.lr_scheduler.rst b/docs/colossalai/colossalai.nn.lr_scheduler.rst index 427a3ee45..f32eb3be4 100644 --- a/docs/colossalai/colossalai.nn.lr_scheduler.rst +++ b/docs/colossalai/colossalai.nn.lr_scheduler.rst @@ -1,10 +1,6 @@ colossalai.nn.lr\_scheduler =========================== -.. automodule:: colossalai.nn.lr_scheduler - :members: - - .. toctree:: :maxdepth: 2 @@ -15,3 +11,7 @@ colossalai.nn.lr\_scheduler colossalai.nn.lr_scheduler.onecycle colossalai.nn.lr_scheduler.poly colossalai.nn.lr_scheduler.torch + + +.. automodule:: colossalai.nn.lr_scheduler + :members: diff --git a/docs/colossalai/colossalai.nn.model.base_model.rst b/docs/colossalai/colossalai.nn.model.base_model.rst deleted file mode 100644 index aac96be7b..000000000 --- a/docs/colossalai/colossalai.nn.model.base_model.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.model.base\_model -=============================== - -.. automodule:: colossalai.nn.model.base_model - :members: diff --git a/docs/colossalai/colossalai.nn.model.model_from_config.rst b/docs/colossalai/colossalai.nn.model.model_from_config.rst new file mode 100644 index 000000000..cea8ff4f4 --- /dev/null +++ b/docs/colossalai/colossalai.nn.model.model_from_config.rst @@ -0,0 +1,5 @@ +colossalai.nn.model.model\_from\_config +=============================== + +.. automodule:: colossalai.nn.model.model_from_config + :members: diff --git a/docs/colossalai/colossalai.nn.model.rst b/docs/colossalai/colossalai.nn.model.rst index b83d174a7..88fc55e06 100644 --- a/docs/colossalai/colossalai.nn.model.rst +++ b/docs/colossalai/colossalai.nn.model.rst @@ -1,17 +1,7 @@ colossalai.nn.model =================== -.. automodule:: colossalai.nn.model - :members: - .. toctree:: :maxdepth: 2 - colossalai.nn.model.vanilla_resnet - colossalai.nn.model.vision_transformer - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.model.base_model + colossalai.nn.model.model_from_config diff --git a/docs/colossalai/colossalai.nn.model.vanilla_resnet.resnet.rst b/docs/colossalai/colossalai.nn.model.vanilla_resnet.resnet.rst deleted file mode 100644 index a2dd49ae3..000000000 --- a/docs/colossalai/colossalai.nn.model.vanilla_resnet.resnet.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.model.vanilla\_resnet.resnet -========================================== - -.. 
automodule:: colossalai.nn.model.vanilla_resnet.resnet - :members: diff --git a/docs/colossalai/colossalai.nn.model.vanilla_resnet.rst b/docs/colossalai/colossalai.nn.model.vanilla_resnet.rst deleted file mode 100644 index 148ce723d..000000000 --- a/docs/colossalai/colossalai.nn.model.vanilla_resnet.rst +++ /dev/null @@ -1,11 +0,0 @@ -colossalai.nn.model.vanilla\_resnet -=================================== - -.. automodule:: colossalai.nn.model.vanilla_resnet - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.model.vanilla_resnet.resnet diff --git a/docs/colossalai/colossalai.nn.model.vision_transformer.rst b/docs/colossalai/colossalai.nn.model.vision_transformer.rst deleted file mode 100644 index edfd07dfa..000000000 --- a/docs/colossalai/colossalai.nn.model.vision_transformer.rst +++ /dev/null @@ -1,11 +0,0 @@ -colossalai.nn.model.vision\_transformer -======================================= - -.. automodule:: colossalai.nn.model.vision_transformer - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.model.vision_transformer.vision_transformer diff --git a/docs/colossalai/colossalai.nn.model.vision_transformer.vision_transformer.rst b/docs/colossalai/colossalai.nn.model.vision_transformer.vision_transformer.rst deleted file mode 100644 index 08e6a96ef..000000000 --- a/docs/colossalai/colossalai.nn.model.vision_transformer.vision_transformer.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.model.vision\_transformer.vision\_transformer -=========================================================== - -.. automodule:: colossalai.nn.model.vision_transformer.vision_transformer - :members: diff --git a/docs/colossalai/colossalai.nn.multi_tensor_apply.multi_tensor_apply.rst b/docs/colossalai/colossalai.nn.multi_tensor_apply.multi_tensor_apply.rst deleted file mode 100644 index 812a4d7eb..000000000 --- a/docs/colossalai/colossalai.nn.multi_tensor_apply.multi_tensor_apply.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.multi\_tensor\_apply.multi\_tensor\_apply -======================================================= - -.. automodule:: colossalai.nn.multi_tensor_apply.multi_tensor_apply - :members: diff --git a/docs/colossalai/colossalai.nn.multi_tensor_apply.rst b/docs/colossalai/colossalai.nn.multi_tensor_apply.rst deleted file mode 100644 index f1ae7c144..000000000 --- a/docs/colossalai/colossalai.nn.multi_tensor_apply.rst +++ /dev/null @@ -1,11 +0,0 @@ -colossalai.nn.multi\_tensor\_apply -================================== - -.. automodule:: colossalai.nn.multi_tensor_apply - :members: - - -.. toctree:: - :maxdepth: 2 - - colossalai.nn.multi_tensor_apply.multi_tensor_apply diff --git a/docs/colossalai/colossalai.nn.optimizer.fp16_optimizer.rst b/docs/colossalai/colossalai.nn.optimizer.fp16_optimizer.rst deleted file mode 100644 index 977bd817d..000000000 --- a/docs/colossalai/colossalai.nn.optimizer.fp16_optimizer.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.optimizer.fp16\_optimizer -======================================= - -.. automodule:: colossalai.nn.optimizer.fp16_optimizer - :members: diff --git a/docs/colossalai/colossalai.nn.optimizer.loss_scaler.rst b/docs/colossalai/colossalai.nn.optimizer.loss_scaler.rst deleted file mode 100644 index 5ee8b9650..000000000 --- a/docs/colossalai/colossalai.nn.optimizer.loss_scaler.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.optimizer.loss\_scaler -==================================== - -.. 
automodule:: colossalai.nn.optimizer.loss_scaler - :members: diff --git a/docs/colossalai/colossalai.nn.optimizer.rst b/docs/colossalai/colossalai.nn.optimizer.rst index 2b0748534..f865b91f4 100644 --- a/docs/colossalai/colossalai.nn.optimizer.rst +++ b/docs/colossalai/colossalai.nn.optimizer.rst @@ -1,20 +1,15 @@ colossalai.nn.optimizer ======================= -.. automodule:: colossalai.nn.optimizer - :members: - - .. toctree:: :maxdepth: 2 - colossalai.nn.optimizer.fp16_optimizer colossalai.nn.optimizer.fused_adam colossalai.nn.optimizer.fused_lamb colossalai.nn.optimizer.fused_sgd colossalai.nn.optimizer.lamb colossalai.nn.optimizer.lars - colossalai.nn.optimizer.loss_scaler - colossalai.nn.optimizer.zero_redundancy_optimizer_level_1 - colossalai.nn.optimizer.zero_redundancy_optimizer_level_2 - colossalai.nn.optimizer.zero_redundancy_optimizer_level_3 + + +.. automodule:: colossalai.nn.optimizer + :members: diff --git a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_1.rst b/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_1.rst deleted file mode 100644 index 04f2e3f96..000000000 --- a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_1.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_1 -============================================================= - -.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_1 - :members: diff --git a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_2.rst b/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_2.rst deleted file mode 100644 index b425f4305..000000000 --- a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_2.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_2 -============================================================= - -.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_2 - :members: diff --git a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_3.rst b/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_3.rst deleted file mode 100644 index 798231e0b..000000000 --- a/docs/colossalai/colossalai.nn.optimizer.zero_redundancy_optimizer_level_3.rst +++ /dev/null @@ -1,5 +0,0 @@ -colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_3 -============================================================= - -.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_3 - :members: diff --git a/docs/colossalai/colossalai.nn.rst b/docs/colossalai/colossalai.nn.rst index 4d4593312..bf83f33f4 100644 --- a/docs/colossalai/colossalai.nn.rst +++ b/docs/colossalai/colossalai.nn.rst @@ -1,16 +1,15 @@ colossalai.nn ============= -.. automodule:: colossalai.nn - :members: - .. toctree:: :maxdepth: 2 - colossalai.nn.data colossalai.nn.layer colossalai.nn.loss colossalai.nn.lr_scheduler colossalai.nn.model - colossalai.nn.multi_tensor_apply colossalai.nn.optimizer + + +.. automodule:: colossalai.nn + :members: diff --git a/docs/colossalai/colossalai.registry.rst b/docs/colossalai/colossalai.registry.rst index 0f294f6d1..2991f04b1 100644 --- a/docs/colossalai/colossalai.registry.rst +++ b/docs/colossalai/colossalai.registry.rst @@ -1,11 +1,11 @@ colossalai.registry =================== -.. automodule:: colossalai.registry - :members: - - .. toctree:: :maxdepth: 2 colossalai.registry.registry + + +.. 
automodule:: colossalai.registry + :members: diff --git a/docs/colossalai/colossalai.rst b/docs/colossalai/colossalai.rst index a4d4656fd..eace3075b 100644 --- a/docs/colossalai/colossalai.rst +++ b/docs/colossalai/colossalai.rst @@ -1,12 +1,18 @@ colossalai ========== -.. automodule:: colossalai - :members: - .. toctree:: :maxdepth: 2 + colossalai.constants + colossalai.core + colossalai.initialize + + +.. toctree:: + :maxdepth: 2 + + colossalai.amp colossalai.builder colossalai.communication colossalai.context @@ -16,11 +22,7 @@ colossalai colossalai.registry colossalai.trainer colossalai.utils + colossalai.zero - -.. toctree:: - :maxdepth: 2 - - colossalai.constants - colossalai.core - colossalai.initialize +.. automodule:: colossalai + :members: diff --git a/docs/colossalai/colossalai.trainer.rst b/docs/colossalai/colossalai.trainer.rst index b2ccefd3e..44bdc06cf 100644 --- a/docs/colossalai/colossalai.trainer.rst +++ b/docs/colossalai/colossalai.trainer.rst @@ -1,9 +1,6 @@ colossalai.trainer ================== -.. automodule:: colossalai.trainer - :members: - .. toctree:: :maxdepth: 2 @@ -14,3 +11,7 @@ colossalai.trainer :maxdepth: 2 colossalai.trainer.metric + + +.. automodule:: colossalai.trainer + :members: diff --git a/docs/colossalai/colossalai.utils.data_sampler.rst b/docs/colossalai/colossalai.utils.data_sampler.rst new file mode 100644 index 000000000..96eac582c --- /dev/null +++ b/docs/colossalai/colossalai.utils.data_sampler.rst @@ -0,0 +1,5 @@ +colossalai.utils.data\_sampler +======================================= + +.. automodule:: colossalai.utils.data_sampler + :members: diff --git a/docs/colossalai/colossalai.utils.gradient_accumulation.rst b/docs/colossalai/colossalai.utils.gradient_accumulation.rst new file mode 100644 index 000000000..6ad2ca3ae --- /dev/null +++ b/docs/colossalai/colossalai.utils.gradient_accumulation.rst @@ -0,0 +1,5 @@ +colossalai.utils.gradient\_accumulation +======================================= + +.. automodule:: colossalai.utils.gradient_accumulation + :members: diff --git a/docs/colossalai/colossalai.utils.multi_tensor_apply.rst b/docs/colossalai/colossalai.utils.multi_tensor_apply.rst new file mode 100644 index 000000000..495b4fa6a --- /dev/null +++ b/docs/colossalai/colossalai.utils.multi_tensor_apply.rst @@ -0,0 +1,8 @@ +colossalai.nn.multi\_tensor\_apply +================================== + +.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply + :members: + + + diff --git a/docs/colossalai/colossalai.utils.rst b/docs/colossalai/colossalai.utils.rst index 7f712e313..998c31bbb 100644 --- a/docs/colossalai/colossalai.utils.rst +++ b/docs/colossalai/colossalai.utils.rst @@ -1,10 +1,6 @@ colossalai.utils ================ -.. automodule:: colossalai.utils - :members: - - .. toctree:: :maxdepth: 2 @@ -12,5 +8,12 @@ colossalai.utils colossalai.utils.checkpointing colossalai.utils.common colossalai.utils.cuda + colossalai.utils.data_sampler + colossalai.utils.gradient_accumulation colossalai.utils.memory + colossalai.utils.multi_tensor_apply colossalai.utils.timer + + +.. automodule:: colossalai.utils + :members: diff --git a/docs/colossalai/colossalai.zero.rst b/docs/colossalai/colossalai.zero.rst new file mode 100644 index 000000000..bbd085d51 --- /dev/null +++ b/docs/colossalai/colossalai.zero.rst @@ -0,0 +1,5 @@ +colossalai.zero +================ + +.. 
automodule:: colossalai.zero + :members: diff --git a/docs/config.md b/docs/config.md index eab4a42e7..72c508d63 100644 --- a/docs/config.md +++ b/docs/config.md @@ -18,6 +18,15 @@ fp16 = dict( initial_scale=2 ** 8 ) +# optional +# configuration for zero +# you can refer to the Zero Redundancy optimizer and zero offload section for details +# https://www.colossalai.org/zero.html +zero = dict( + level=, + ... +) + # optional # if you are using complex gradient handling # otherwise, you do not need this in your config file diff --git a/docs/installation.md b/docs/installation.md index 0ce1de8de..50858d05c 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,15 +1,17 @@ # Setup -## Install with pip +### PyPI ```bash pip install colossalai ``` -## Install from source +### Install From Source (Recommended) + +> We **recommend** you to install from source as the Colossal-AI is updating frequently in the early versions. The documentation will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :) ```shell -git clone git@github.com:hpcaitech/ColossalAI.git +git clone https://github.com/hpcaitech/ColossalAI.git cd ColossalAI # install dependency pip install -r requirements/requirements.txt @@ -22,8 +24,4 @@ Install and enable CUDA kernel fusion (compulsory installation when using fused ```shell pip install -v --no-cache-dir --global-option="--cuda_ext" . - -# install with editable enabled -pip install -v --no-cache-dir --global-option="--cuda_ext" -e . ``` - diff --git a/docs/run_demo.md b/docs/run_demo.md index 2b0c4bdf3..60d7eebf5 100644 --- a/docs/run_demo.md +++ b/docs/run_demo.md @@ -7,51 +7,92 @@ can also run on systems with only one GPU. Quick demos showing how to use Coloss ## Single GPU Colossal-AI can be used to train deep learning models on systems with only one GPU and achieve baseline -performances. [Here](https://colab.research.google.com/drive/1fJnqqFzPuzZ_kn1lwCpG2nh3l2ths0KE?usp=sharing#scrollTo=cQ_y7lBG09LS) -is an example showing how to train a LeNet model on the CIFAR10 dataset using Colossal-AI. +performances. We provided an example to train ResNet on CIFAR10 data with only one GPU. You can find this example in +`examples\resnet_cifar10_data_parallel` in the repository. Detailed instructions can be found in its `README.md`. ## Multiple GPUs Colossal-AI can be used to train deep learning models on distributed systems with multiple GPUs and accelerate the training process drastically by applying efficient parallelization techiniques, which will be elaborated in -the [Parallelization](parallelization.md) section below. Run the code below on your distributed system with 4 GPUs, -where `HOST` is the IP address of your system. Note that we use -the [Slurm](https://slurm.schedmd.com/documentation.html) job scheduling system here. +the [Parallelization](parallelization.md) section below. -```bash -HOST=xxx.xxx.xxx.xxx srun ./scripts/slurm_dist_train.sh ./examples/run_trainer.py ./configs/vit/vit_2d.py -``` +You can turn the resnet example mentioned above into a multi-GPU training by setting `--nproc_per_node` to be the number of +GPUs you have on your system. We also provide an example of Vision Transformer which relies on +training with more GPUs. You can visit this example in `examples\vit_b16_imagenet_data_parallel`. It has a detailed instructional +`README.md` for you too. -`./configs/vit/vit_2d.py` is a config file, which is introduced in the [Config file](config.md) section below. 
These -config files are used by Colossal-AI to define all kinds of training arguments, such as the model, dataset and training -method (optimizer, lr_scheduler, epoch, etc.). Config files are highly customizable and can be modified so as to train -different models. -`./examples/run_trainer.py` contains a standard training script and is presented below, it reads the config file and -realizes the training process. + +## Sample Training Script + +Below is a typical way of how you train the model using ```python import colossalai -from colossalai.core import global_context as gpc +from colossalai.amp import AMP_TYPE from colossalai.logging import get_dist_logger -from colossalai.trainer import Trainer +from colossalai.trainer import Trainer, hooks +from colossalai.utils import get_dataloader +CONFIG = dict( + parallel=dict( + pipeline=1, + tensor=1, mode=None + ), + fp16 = dict( + mode=AMP_TYPE.TORCH + ), + gradient_accumulation=4, + clip_grad_norm=1.0 +) + def run_trainer(): - engine, train_dataloader, test_dataloader = colossalai.initialize() + parser = colossalai.get_default_parser() + args = parser.parse_args() + colossalai.launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) + logger = get_dist_logger() - logger.info("engine is built", ranks=[0]) + # instantiate your compoentns + model = MyModel() + optimizer = MyOptimizer(model.parameters(), ...) + train_dataset = TrainDataset() + test_dataset = TestDataset() + train_dataloader = get_dataloader(train_dataset, ...) + test_dataloader = get_dataloader(test_dataset, ...) + lr_scheduler = MyScheduler() + logger.info("components are built") + + engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(model, + optimizer, + criterion, + train_dataloader, + test_dataloader, + lr_scheduler) trainer = Trainer(engine=engine, verbose=True) - logger.info("trainer is built", ranks=[0]) - logger.info("start training", ranks=[0]) + hook_list = [ + hooks.LossHook(), + hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), + hooks.AccuracyHook(), + hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]), + hooks.LogMetricByEpochHook(logger), + hooks.LogMemoryByEpochHook(logger), + hooks.SaveCheckpointHook(checkpoint_dir='./ckpt') + ] + trainer.fit( train_dataloader=train_dataloader, test_dataloader=test_dataloader, - epochs=gpc.config.num_epochs, - hooks_cfg=gpc.config.hooks, + epochs=NUM_EPOCH, + hooks=hook_list, display_progress=True, test_interval=2 ) diff --git a/docs/zero.md b/docs/zero.md index 201223803..d2a6c1658 100644 --- a/docs/zero.md +++ b/docs/zero.md @@ -19,6 +19,7 @@ Below are a few examples of ZeRO-3 configurations. ### Example of ZeRO-3 Configurations +You can refer to the [DeepSpeed configuration](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for details. Here we use `Adam` as the initial optimizer. 1. Use ZeRO to partition the optimizer states, gradients (level 2), and parameters (level 3). 
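> Note (illustration only): the new `zero = dict(level=, ...)` block added to `docs/config.md` above deliberately leaves its values blank, and `docs/zero.md` defers the full schema to the linked DeepSpeed configuration reference. A minimal sketch of what such a section might look like is given below; the choice `level=3` is just an example taken from the level-2/level-3 description in `docs/zero.md`, not a recommended setting, and any further fields would follow the DeepSpeed reference.

```python
# Illustrative sketch of a `zero` config section (values are example choices, not defaults).
zero = dict(
    level=3,  # 2 = partition optimizer states + gradients, 3 = additionally partition parameters
    # remaining options (e.g. offloading) follow the DeepSpeed ZeRO configuration linked above
)
```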
diff --git a/examples/colossal_cifar_demo.ipynb b/examples/colossal_cifar_demo.ipynb deleted file mode 100644 index 266fd2543..000000000 --- a/examples/colossal_cifar_demo.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "uhrbvVEh2iJd" - }, - "source": [ - "# Train an image classifier\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vP7LvCpG23a2", - "outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n", - "Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from deepspeed) (21.0)\n", - "Requirement already satisfied: triton in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.1.1)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from deepspeed) (4.62.3)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.19.5)\n", - "Requirement already satisfied: tensorboardX==1.8 in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.8)\n", - "Requirement already satisfied: ninja in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.10.2.2)\n", - "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from deepspeed) (1.9.0+cu111)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.7/dist-packages (from deepspeed) (5.4.8)\n", - "Requirement already satisfied: protobuf>=3.2.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (3.17.3)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from tensorboardX==1.8->deepspeed) (1.15.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->deepspeed) (2.4.7)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->deepspeed) (3.7.4.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n" - ] - } - ], - "source": [ - "!pip install ColossalAI deepspeed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UVKEurtS4SFS", - "outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please install apex to use FP16 Optimizer\n", - "Apex should be installed to use the FP16 optimizer\n", - "apex is required for mixed precision training\n" - ] - } - ], - "source": [ - "import colossalai\n", - "from colossalai.engine import Engine, NonPipelineSchedule\n", - "from colossalai.trainer import Trainer\n", - "from colossalai.context import Config\n", - "import torch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PpFfhNBD7NSn" - }, - "source": [ - "First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8yF7Lc-K7NAS", - "outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,605 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,608 INFO: Added key: store_based_barrier_key:3 to store for rank: 0\n", - "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,610 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "process rank 0 is bound to device 0\n", - "initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n" - ] - } - ], - "source": [ - "parallel_cfg = Config(dict(parallel=dict(\n", - " data=dict(size=1),\n", - " pipeline=dict(size=1),\n", - " tensor=dict(size=1, mode=None),\n", - ")))\n", - "colossalai.init_dist(config=parallel_cfg,\n", - " local_rank=0,\n", - " world_size=1,\n", - " host='127.0.0.1',\n", - " port=8888,\n", - " backend='nccl')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ppjmMxc_81TK" - }, - "source": [ - "Load and normalize the CIFAR10 training and test datasets using `colossalai.nn.data`. Note that we have wrapped `torchvision.transforms`, so that we can simply use the config dict to use them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZyGhyD47-dUY", - "outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Files already downloaded and verified\n", - "Files already downloaded and verified\n" - ] - } - ], - "source": [ - "transform_cfg = [\n", - " dict(type='ToTensor'),\n", - " dict(type='Normalize',\n", - " mean=[0.4914, 0.4822, 0.4465],\n", - " std=[0.2023, 0.1994, 0.2010]),\n", - "]\n", - "\n", - "batch_size = 128\n", - "\n", - "trainset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=True)\n", - "trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)\n", - "\n", - "testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n", - "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NvPbfLLR9NzC" - }, - "source": [ - "We just define a simple Convolutional Neural Network here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cQ_y7lBG09LS" - }, - "outputs": [], - "source": [ - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "\n", - "class Net(nn.Module):\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.conv1 = nn.Conv2d(3, 6, 5)\n", - " self.pool = nn.MaxPool2d(2, 2)\n", - " self.conv2 = nn.Conv2d(6, 16, 5)\n", - " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", - " self.fc2 = nn.Linear(120, 84)\n", - " self.fc3 = nn.Linear(84, 10)\n", - "\n", - " def forward(self, x):\n", - " x = self.pool(F.relu(self.conv1(x)))\n", - " x = self.pool(F.relu(self.conv2(x)))\n", - " x = torch.flatten(x, 1) # flatten all dimensions except batch\n", - " x = F.relu(self.fc1(x))\n", - " x = F.relu(self.fc2(x))\n", - " x = self.fc3(x)\n", - " return x\n", - "\n", - "\n", - "model = Net().cuda()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tgsszAmM9dYZ" - }, - "source": [ - "Define a Loss function and optimizer. And then we use them to initialize `Engine` and `Trainer`. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "YtaDoCax1BCf", - "outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n" - ] - } - ], - "source": [ - "import torch.optim as optim\n", - "\n", - "criterion = nn.CrossEntropyLoss()\n", - "optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n", - "schedule = NoPipelineSchedule()\n", - "engine = Engine(\n", - " model=model,\n", - " criterion=criterion,\n", - " optimizer=optimizer,\n", - " lr_scheduler=None,\n", - " schedule=schedule\n", - " )\n", - "trainer = Trainer(engine=engine,\n", - " hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n", - " verbose=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_JR2TuvH99Ik" - }, - "source": [ - "Then we set training configs. We train our model for 10 epochs and it will be evaluated every 1 epoch. Set `display_progress` to `True` to display the training / evaluating progress bar." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "w-J3IP-J1sfx", - "outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[Epoch 0 train]: 0%| | 0/391 [00:00 None: - super().__init__(trainer, priority) - - def before_train(self): - total_batch_size = gpc.config.BATCH_SIZE * \ - gpc.config.engine.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA) - self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0]) diff --git a/examples/vit-b16/train_dali.py b/examples/vit-b16/train_dali.py deleted file mode 100644 index 31bd3be4d..000000000 --- a/examples/vit-b16/train_dali.py +++ /dev/null @@ -1,70 +0,0 @@ -import glob -import os -import colossalai -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_dist_logger -from colossalai.trainer import Trainer -from colossalai.utils import set_global_multitimer_status -from dataloader.imagenet_dali_dataloader import DaliDataloader - - -def build_dali_train(): - root = gpc.config.dali.root - train_pat = os.path.join(root, 'train/*') - train_idx_pat = os.path.join(root, 'idx_files/train/*') - return DaliDataloader( - sorted(glob.glob(train_pat)), - sorted(glob.glob(train_idx_pat)), - batch_size=gpc.config.BATCH_SIZE, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=True, - gpu_aug=gpc.config.dali.gpu_aug, - cuda=True, - mixup_alpha=gpc.config.dali.mixup_alpha - ) - - -def build_dali_test(): - root = gpc.config.dali.root - val_pat = os.path.join(root, 'validation/*') - val_idx_pat = os.path.join(root, 'idx_files/validation/*') - return DaliDataloader( - sorted(glob.glob(val_pat)), - sorted(glob.glob(val_idx_pat)), - batch_size=gpc.config.BATCH_SIZE, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=False, - # gpu_aug=gpc.config.dali.gpu_aug, - gpu_aug=False, - cuda=True, - mixup_alpha=gpc.config.dali.mixup_alpha - ) - - -def main(): - engine, train_dataloader, test_dataloader = colossalai.initialize( - train_dataloader=build_dali_train, - test_dataloader=build_dali_test - ) - logger = get_dist_logger() - set_global_multitimer_status(True) - timer = colossalai.utils.get_global_multitimer() - trainer = Trainer(engine=engine, - verbose=True, - timer=timer) - - trainer.fit( - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - epochs=gpc.config.NUM_EPOCHS, - hooks_cfg=gpc.config.hooks, - display_progress=True, - test_interval=1 - ) - - -if __name__ == '__main__': - main() diff --git a/examples/vit-b16/vit-b16.py b/examples/vit-b16/vit-b16.py deleted file mode 100755 index ac51e226e..000000000 --- a/examples/vit-b16/vit-b16.py +++ /dev/null @@ -1,78 +0,0 @@ -from colossalai.engine import AMP_TYPE -from torch.nn import CrossEntropyLoss -from mixup import MixupLoss -from hooks import TotalBatchsizeHook -from colossalai.registry import MODELS -from timm.models import vit_base_patch16_224 - -MODELS.register_module(vit_base_patch16_224) - -LOG_NAME = 'vit-b16-1k-32k-mixup-light2' -# ViT Base -BATCH_SIZE = 256 -DROP_RATE = 0.1 -NUM_EPOCHS = 300 - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None), -) - -optimizer = dict( - type='Lamb', - lr=1.8e-2, - weight_decay=0.1, -) - - -loss = dict( - type='MixupLoss', - 
loss_fn_cls=CrossEntropyLoss -) - -model = dict( - type='vit_base_patch16_224', - drop_rate=DROP_RATE, -) - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='AccuracyHook'), - dict(type='LossHook'), - dict(type='TotalBatchsizeHook'), - dict(type='TensorboardHook', log_dir=f'./tb_logs/{LOG_NAME}'), - dict(type='SaveCheckpointHook', interval=1, - checkpoint_dir=f'./ckpt/{LOG_NAME}'), - # dict(type='LoadCheckpointHook', epoch=10, - # checkpoint_dir=f'./ckpt/{LOG_NAME}'), - dict( - type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict( - type='LinearWarmupLR', - warmup_steps=150 - ) - ), -] - -fp16 = dict( - mode=AMP_TYPE.TORCH, -) - - -logging = dict( - root_path=f"./logs/{LOG_NAME}" -) - -dali = dict( - root='./dataset/ILSVRC2012_1k', - gpu_aug=True, - mixup_alpha=0.2 -) - -engine = dict( - schedule=None, - gradient_handlers=None, - gradient_accumulation=16, - gradient_clipping=1.0, -) diff --git a/examples/vit_b16_imagenet_data_parallel/README.md b/examples/vit_b16_imagenet_data_parallel/README.md new file mode 100644 index 000000000..4a7203832 --- /dev/null +++ b/examples/vit_b16_imagenet_data_parallel/README.md @@ -0,0 +1,90 @@ +# Overview + +A common way to speed up AI model training is to implement large-batch training with the help of data parallelism, but this requires expensive supercomputer clusters. In this example, we used a small server with only 4 GPUs to reproduce the large-scale pre-training of Vision Transformer (ViT) on ImageNet-1K in 14 hours. + +# How to run + +On a single server, you can directly use torch.distributed to start pre-training on multiple GPUs in parallel. In Colossal-AI, we provided several launch methods to init the distributed backend. You can use `colossalai.launch` and `colossalai.get_default_parser` to pass the parameters via command line. If you happen to use launchers such as SLURM, OpenMPI and PyTorch launch utility, you can use `colossalai.launch_from_` to read rank and world size from the environment variables directly for convenience. In this example, we use `launch_from_slurm` for demo purpose. You can check out more information about SLURM [here](https://slurm.schedmd.com/documentation.html). + +```shell +HOST= srun bash ./scripts/train_slurm.sh +``` + +--- + +If you are using `colossalai.launch`, do this: +In your training script: +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() +colossalai.launch(config=args.config, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) +``` + +In your terminal: +```shell + python train.py --config ./config.py --rank --world_size --host --port 29500 +``` +--- +If you are using `colossalai.launch_from_torch`, do this: +In your training script: + +```python +# initialize distributed setting +parser = colossalai.get_default_parser() +args = parser.parse_args() +colossalai.launch_from_torch(config=args.config, + host=args.host, + port=args.port, + backend=args.backend + ) +``` + +In your terminal +```shell +python -m torch.distributed.launch --nproc_per_node train.py --config ./config.py --host --port 29500 +``` + +# Experiments +To facilitate more people to reproduce the experiments with large-scale data parallel, we pre-trained ViT-Base/32 in only 14.58 hours on a small server with 4 NVIDIA A100 GPUs using ImageNet-1K dataset with batch size 32K for 300 epochs maintaining accuracy. 
For more complex pre-training of ViT-Base/16 and ViT-Large/32, it also takes only 78.58 hours and 37.83 hours to complete. Since the server used in this example is not a standard NVIDIA DGX A100 supercomputing unit, perhaps a better acceleration can be obtained on more professional hardware. + +![Loss Curve](./results/loss.jpeg) +![Accuracy](./results/acc.jpeg) + +As can be seen from the above figure, the ViT model eventually converges well after training 300 epochs. It is worth noting that, unlike the common small-batch training convergence process, the model performance has a temporary decline in the middle of the large-batch training process. This is due to the difficulty of convergence in large-batch training. As the number of iterations is reduced, a larger learning rate is needed to ensure the final convergence. Since we did not carefully adjust the parameters, perhaps other parameter settings could get better convergence. + +# Details +`config.py` + +This is a [configuration file](https://colossalai.org/config.html) that defines hyperparameters and trainign scheme (fp16, gradient accumulation, etc.). The config content can be accessed through `gpc.config` in the program. + +In this example, we trained ViT-Base/16 for 300 epochs on the ImageNet-1K dataset. The batch size is expanded to 32K through data parallelism. Since only 4 A100 GPUs on one small server are used, and the GPU memory is limited, the batch size of 32K cannot be used directly. Therefore, the batch size used on each GPU is only 256, and the 256 batch size is equivalently expanded to 8K through gradient accumulation 32 times. Finally, data parallelism is used between 4 GPUs to achieve an equivalent batch size of 32K. + +Since the batch size of 32K far exceeds the use range of common optimizers and is difficult to train, we use the large-batch optimizer [LAMB](https://arxiv.org/abs/1904.00962) provided by Colossal-AI to achieve a better convergence. The learning rate and weight decay of [LAMB](https://arxiv.org/abs/1904.00962) are set to 1.8e-2 and 0.1, respectively. The learning rate scheduler uses a linear warmup strategy of 150 epochs. We also used FP16 mixed precision to speed up the training process, and introduced gradient clipping to help convergence. For simplicity and speed, we only use [Mixup](https://arxiv.org/abs/1710.09412) instead of `RandAug` in data augmentation. + +By tuning the parallelism, this example can be quickly deployed to a single server with several GPUs or to a large cluster with lots of nodes and GPUs. If there are enough computing resources to allow data parallel to be directly extended to hundreds or even thousands of GPUs, the training process of several days on a single A100 GPU can be shortened to less than half an hour. + +`imagenet_dali_dataloader.py` + +To accelerate the training process, we use [DALI](https://github.com/NVIDIA/DALI) to read data and require the dataset to be in TFRecord format, which avoids directly reading a large number of raw image files and being limited by the efficiency of the file system. + +`train.py` + +We call DALI in this file to read data and start the training process using Colossal-AI. + +`mixup.py` + +Since Mixup is used as data augmentation, we define the loss function of Mixup here. + +`myhooks.py` +We define hook functions that record running information to help debugging. + +# How to build TFRecords dataset + +As we use [DALI](https://github.com/NVIDIA/DALI) to read data, we use the TFRecords dataset instead of raw Imagenet dataset. 
+
+# How to build TFRecords dataset
+
+As we use [DALI](https://github.com/NVIDIA/DALI) to read data, we use a TFRecords dataset instead of the raw ImageNet dataset. If you don't have a TFRecords dataset, follow [imagenet-tools](https://github.com/ver217/imagenet-tools) to build one.
\ No newline at end of file
diff --git a/examples/vit_b16_imagenet_data_parallel/config.py b/examples/vit_b16_imagenet_data_parallel/config.py
new file mode 100755
index 000000000..cf7b10f87
--- /dev/null
+++ b/examples/vit_b16_imagenet_data_parallel/config.py
@@ -0,0 +1,21 @@
+from colossalai.amp import AMP_TYPE
+
+
+# ViT Base
+BATCH_SIZE = 256
+DROP_RATE = 0.1
+NUM_EPOCHS = 300
+
+fp16 = dict(
+    mode=AMP_TYPE.TORCH,
+)
+
+gradient_accumulation = 16
+gradient_clipping = 1.0
+
+dali = dict(
+    # root='./dataset/ILSVRC2012_1k',
+    root='/project/scratch/p200012/dataset/ILSVRC2012_1k',
+    gpu_aug=True,
+    mixup_alpha=0.2
+)
diff --git a/examples/vit-b16/dataloader/__init__.py b/examples/vit_b16_imagenet_data_parallel/dataloader/__init__.py
similarity index 100%
rename from examples/vit-b16/dataloader/__init__.py
rename to examples/vit_b16_imagenet_data_parallel/dataloader/__init__.py
diff --git a/examples/vit-b16/dataloader/imagenet_dali_dataloader.py b/examples/vit_b16_imagenet_data_parallel/dataloader/imagenet_dali_dataloader.py
similarity index 100%
rename from examples/vit-b16/dataloader/imagenet_dali_dataloader.py
rename to examples/vit_b16_imagenet_data_parallel/dataloader/imagenet_dali_dataloader.py
diff --git a/examples/vit-b16/mixup.py b/examples/vit_b16_imagenet_data_parallel/mixup.py
similarity index 100%
rename from examples/vit-b16/mixup.py
rename to examples/vit_b16_imagenet_data_parallel/mixup.py
diff --git a/examples/vit_b16_imagenet_data_parallel/myhooks.py b/examples/vit_b16_imagenet_data_parallel/myhooks.py
new file mode 100644
index 000000000..15f1f2e46
--- /dev/null
+++ b/examples/vit_b16_imagenet_data_parallel/myhooks.py
@@ -0,0 +1,15 @@
+from colossalai.trainer.hooks import BaseHook
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
+from colossalai.logging import get_dist_logger
+
+
+class TotalBatchsizeHook(BaseHook):
+    def __init__(self, priority: int = 2) -> None:
+        super().__init__(priority)
+        self.logger = get_dist_logger()
+
+    def before_train(self, trainer):
+        total_batch_size = gpc.config.BATCH_SIZE * \
+            gpc.config.gradient_accumulation * gpc.get_world_size(ParallelMode.DATA)
+        self.logger.info(f'Total batch size = {total_batch_size}', ranks=[0])
diff --git a/examples/vit-b16/acc.jpeg b/examples/vit_b16_imagenet_data_parallel/results/acc.jpeg
similarity index 100%
rename from examples/vit-b16/acc.jpeg
rename to examples/vit_b16_imagenet_data_parallel/results/acc.jpeg
diff --git a/examples/vit-b16/loss.jpeg b/examples/vit_b16_imagenet_data_parallel/results/loss.jpeg
similarity index 100%
rename from examples/vit-b16/loss.jpeg
rename to examples/vit_b16_imagenet_data_parallel/results/loss.jpeg
diff --git a/examples/vit_b16_imagenet_data_parallel/scripts/train_slurm.sh b/examples/vit_b16_imagenet_data_parallel/scripts/train_slurm.sh
new file mode 100644
index 000000000..870d163fe
--- /dev/null
+++ b/examples/vit_b16_imagenet_data_parallel/scripts/train_slurm.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+python train.py --host $HOST --config ./config.py --port 29500
\ No newline at end of file
diff --git a/examples/vit_b16_imagenet_data_parallel/train.py b/examples/vit_b16_imagenet_data_parallel/train.py
new file mode 100644
index 000000000..5f88940ba
--- /dev/null
+++ b/examples/vit_b16_imagenet_data_parallel/train.py
@@ -0,0 +1,116 @@
+import glob
+from math import log
+import os
+import colossalai
+import torch
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.trainer import Trainer, hooks
+from colossalai.nn.lr_scheduler import LinearWarmupLR
+from dataloader.imagenet_dali_dataloader import DaliDataloader
+from mixup import MixupLoss
+from timm.models import vit_base_patch16_224
+from myhooks import TotalBatchsizeHook
+
+
+def build_dali_train():
+    root = gpc.config.dali.root
+    train_pat = os.path.join(root, 'train/*')
+    train_idx_pat = os.path.join(root, 'idx_files/train/*')
+    return DaliDataloader(
+        sorted(glob.glob(train_pat)),
+        sorted(glob.glob(train_idx_pat)),
+        batch_size=gpc.config.BATCH_SIZE,
+        shard_id=gpc.get_local_rank(ParallelMode.DATA),
+        num_shards=gpc.get_world_size(ParallelMode.DATA),
+        training=True,
+        gpu_aug=gpc.config.dali.gpu_aug,
+        cuda=True,
+        mixup_alpha=gpc.config.dali.mixup_alpha
+    )
+
+
+def build_dali_test():
+    root = gpc.config.dali.root
+    val_pat = os.path.join(root, 'validation/*')
+    val_idx_pat = os.path.join(root, 'idx_files/validation/*')
+    return DaliDataloader(
+        sorted(glob.glob(val_pat)),
+        sorted(glob.glob(val_idx_pat)),
+        batch_size=gpc.config.BATCH_SIZE,
+        shard_id=gpc.get_local_rank(ParallelMode.DATA),
+        num_shards=gpc.get_world_size(ParallelMode.DATA),
+        training=False,
+        # gpu_aug=gpc.config.dali.gpu_aug,
+        gpu_aug=False,
+        cuda=True,
+        mixup_alpha=gpc.config.dali.mixup_alpha
+    )
+
+
+def main():
+    # initialize distributed setting
+    parser = colossalai.get_default_parser()
+    args = parser.parse_args()
+    colossalai.launch_from_slurm(config=args.config,
+                                 host=args.host,
+                                 port=args.port,
+                                 backend=args.backend
+                                 )
+
+    # get logger
+    logger = get_dist_logger()
+    logger.info("initialized distributed environment", ranks=[0])
+
+    # build model
+    model = vit_base_patch16_224(drop_rate=0.1)
+
+    # build dataloader
+    train_dataloader = build_dali_train()
+    test_dataloader = build_dali_test()
+
+    # build optimizer
+    optimizer = colossalai.nn.Lamb(model.parameters(), lr=1.8e-2, weight_decay=0.1)
+
+    # build loss
+    criterion = MixupLoss(loss_fn_cls=torch.nn.CrossEntropyLoss)
+
+    # build lr scheduler
+    lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=50, total_steps=gpc.config.NUM_EPOCHS)
+
+    engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
+        model, optimizer, criterion, train_dataloader, test_dataloader
+    )
+    logger.info("initialized colossalai components", ranks=[0])
+
+    # build trainer
+    trainer = Trainer(engine=engine, logger=logger)
+
+    # build hooks
+    hook_list = [
+        hooks.LossHook(),
+        hooks.AccuracyHook(),
+        hooks.LogMetricByEpochHook(logger),
+        hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
+        TotalBatchsizeHook(),
+
+        # comment out the hooks below if you do not need them
+        hooks.SaveCheckpointHook(interval=1, checkpoint_dir='./ckpt'),
+        hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
+    ]
+
+    # start training
+    trainer.fit(
+        train_dataloader=train_dataloader,
+        test_dataloader=test_dataloader,
+        epochs=gpc.config.NUM_EPOCHS,
+        hooks=hook_list,
+        display_progress=True,
+        test_interval=1
+    )
+
+
+if __name__ == '__main__':
+    main()