update examples and sphinx docs for the new api (#63)

pull/66/head
Frank Lee 2021-12-13 22:07:01 +08:00 committed by GitHub
parent 7d3711058f
commit 35813ed3c4
124 changed files with 1251 additions and 1462 deletions

View File

@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai
pip install colossalai
```
### Install From Source
### Install From Source (Recommended)
> We **recommend** that you install from source, as Colossal-AI is updated frequently in these early versions. The documentation is kept in line with the main branch of the repository. Feel free to raise an issue if you encounter any problems. :)
```shell
git clone git@github.com:hpcaitech/ColossalAI.git
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
@ -64,8 +66,8 @@ model = ...
# sampler by default
train_dataset = ...
train_dataloader = get_dataloader(dataset=dataset,
shuffle=True,
)
shuffle=True,
)
# build your

View File

@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
criterion: _Loss,
mode: AMP_TYPE,
amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param mode: amp mode
:type mode: :class:`colossalai.amp.AMP_TYPE`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
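For orientation, a minimal usage sketch of this helper under the new API; the model, optimizer and criterion below are placeholders, and the availability of `convert_to_amp` and `AMP_TYPE` at the `colossalai.amp` package level is assumed from the docstring types:
```python
import torch
from colossalai.amp import AMP_TYPE, convert_to_amp

# placeholder components purely for illustration
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

# wrap the components; AMP_TYPE.TORCH is assumed to select the torch_amp backend
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion, mode=AMP_TYPE.TORCH)
```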

View File

@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for nvidia apex
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
optimizer = ApexAMPOptimizer(optimizer)
return model, optimizer

View File

@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32
class ApexAMPOptimizer(ColossalaiOptimizer):
''' A wrapper class for the APEX optimizer that implements apex-specific backward and clip_grad_norm
methods
'''
def backward(self, loss: Tensor):
"""
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
scaled_loss.backward()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""
:param model: your model object
:type model: torch.nn.Module
:param max_norm: the max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0:
clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)

View File

@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for naive mode amp
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
if is_no_pp_or_last_stage():
model = NaiveAMPModel(model, output_to_fp32=True)
else:

View File

@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
"""Float16 optimizer for fp16 and bf16 data types.
Arguments:
optimizer: base optimizer such as Adam or SGD
clip_grad: clip gradeints with this global L2 norm. Note
that clipping is ignored if clip_grad == 0
log_num_zeros_in_grad: return number of zeros in the gradients.
params_have_main_grad: flag indicating if parameters have
a `main_grad` field. If this is set, we are assuming
that the model parameters are store in the `main_grad`
field instead of the typical `grad` field. This happens
for the DDP cases where there is a contihuous buffer
holding the gradients. For example for bfloat16, we want
to do gradient accumulation and all-reduces in float32
and as a result we store those gradients in the main_grad.
Note that main grad is not necessarily in float32.
bf16: if true, the model is running in bfloat16.
grad_scaler: used for scaling gradients. Note that this can be
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
a constnat gradient scaler. Also for `bf16 = False`, we
always require a grad scaler.
:param optimizer: base optimizer such as Adam or SGD
:type optimizer: torch.optim.Optimizer
:param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
:type clip_grad: float
:param log_num_zeros_in_grad: return number of zeros in the gradients.
:type log_num_zeros_in_grad: bool
:param initial_scale: initial scale of gradient scaler
:type initial_scale: int
:param growth_factor: the growth rate of loss scale
:type growth_factor: int
:param backoff_factor: the decrease rate of loss scale
:type backoff_factor: float
:param hysteresis: delay shift in dynamic loss scaling
:type hysteresis: int
:param max_scale: maximum loss scale allowed
:type max_scale: int
"""
def __init__(self,

View File

@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer
class NaiveAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class for optimizer to cast all parameters to fp16
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
super().__init__(optim)
def backward(self, loss: Tensor):
"""backward with gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
loss = self.optim.scale_loss(loss)
loss.backward()
@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
class NaiveAMPModel(nn.Module):
"""A wrapper class for model to cast the model into fp16 and
automatically cast the input and output
"""
def __init__(self,
model: nn.Module,

View File

@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
optimizer: Optimizer,
criterion: _Loss,
amp_config: Config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
model = TorchAMPModel(model)
optimizer = TorchAMPOptimizer(optimizer, **amp_config)
criterion = TorchAMPLoss(criterion)

View File

@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32
class TorchAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class which integrate pytorch amp with an optimizer
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
super().__init__(optim)
self.scaler = GradScaler(*args, **kwargs)
def backward(self, loss: Tensor):
"""backward with torch amp gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
self.scaler.scale(loss).backward()
def step(self):
"""update the parameters of the model
"""
self.scaler.step(self.optim)
self.scaler.update()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""apply gradient clipping to the model parameters
:param model: your model object
:type model: torch.nn.Module
:param max_norm: max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0.0:
self.scaler.unscale_(self.optim)
clip_grad_norm_fp32(model.parameters(), max_norm)
class TorchAMPModel(nn.Module):
"""A wrapper class for a model object which executes forward with values automatically
cast to fp16
"""
def __init__(self, model: nn.Module) -> None:
super().__init__()
@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):
class TorchAMPLoss(nn.Module):
"""A wrapper class for a criterion object which computes the loss in mixed-precision context
:param loss: a loss function object
:type loss: torch.nn.modules.loss._Loss
"""
def __init__(self, loss: _Loss):
super().__init__()
self.loss = loss

View File

@ -16,8 +16,8 @@ def build_from_config(module, config: dict):
of the return object
:type config: dict
:raises AssertionError: Raises an AssertionError if `module` is not a class
:return: An object of :class:`module`
:rtype: :class:`module`
:return: An object of interest
:rtype: Object
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
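As a quick illustration of `build_from_config`, using a plain PyTorch class since any class plus keyword arguments will do; importing it from the `colossalai.builder` package level is an assumption based on the Sphinx docs in this commit:
```python
import torch.nn as nn
from colossalai.builder import build_from_config

# the config dict simply supplies the constructor's keyword arguments
layer = build_from_config(nn.Linear, dict(in_features=16, out_features=8))
assert isinstance(layer, nn.Linear)
```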
@ -62,8 +62,8 @@ def build_layer(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module`
:rtype: :class:`nn.Module`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, LAYERS)
@ -75,8 +75,8 @@ def build_loss(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.autograd.Function`
:rtype: :class:`torch.autograd.Function`
:return: An object of :class:`torch.nn.modules.loss._Loss`
:rtype: :class:`torch.nn.modules.loss._Loss`
"""
return build_from_registry(config, LOSSES)
@ -87,8 +87,8 @@ def build_model(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module`
:rtype: :class:`nn.Module`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, MODELS)
@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer):
:type model: :class:`nn.Module`
:param optimizer: An optimizer object containing parameters for the gradient handler
:type optimizer: :class:`torch.optim.Optimizer`
:return: An object of :class:`BaseGradientHandler`
:rtype: :class:`BaseGradientHandler`
:return: An object of :class:`colossalai.engine.BaseGradientHandler`
:rtype: :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
@ -151,8 +151,8 @@ def build_hooks(config, trainer):
:type config: dict or :class:`colossalai.context.Config`
:param trainer: A :class:`Trainer` object containing parameters for the hook
:type trainer: :class:`Trainer`
:return: An object of :class:`BaseHook`
:rtype: :class:`BaseHook`
:return: An object of :class:`colossalai.trainer.hooks.BaseHook`
:rtype: :class:`colossalai.trainer.hooks.BaseHook`
"""
config_ = config.copy()
config_['trainer'] = trainer
@ -182,8 +182,8 @@ def build_data_sampler(config, dataset):
:param dataset: An object of :class:`torch.utils.data.Dataset` containing information
used in the construction of the return object
:type dataset: :class:`torch.utils.data.Dataset`
:return: An object of :class:`colossalai.nn.data.sampler.BaseSampler`
:rtype: :class:`colossalai.nn.data.sampler.BaseSampler`
:return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
:rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
"""
config_ = config.copy()
config_['dataset'] = dataset
@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer):
:param optimizer: An optimizer object containing parameters for the learning rate
scheduler
:type optimizer: :class:`torch.optim.Optimizer`
:param total_steps: Number of total steps of the learning rate scheduler
:type total_steps: int
:param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
:type num_steps_per_epoch: int
:return: An object of :class:`torch.optim.lr_scheduler`
:rtype: :class:`torch.optim.lr_scheduler`
"""

View File

@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):
class PipelineModelInitializer():
"""An intializer to split the model into different stages for pipeline parallelism.
An example for the model config is shown below. The class VisionTransformerFromConfig should
inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
of layer configurations.
model_config = dict(
type='VisionTransformerFromConfig',
embedding_cfg=dict(...),
...
)
:param config: configuration of the model
:type config: dict
:param num_chunks: the number of chunks you want to have on the current stage. This value should be 1
in most cases unless you are using virtual pipeline parallelism.
:type num_chunks: int
:param verbose: whether to print the logs
:type verbose: bool
"""
def __init__(self, config, num_chunks, verbose=False):
self.num_chunks = num_chunks
self.ori_model = build_model(config)
@ -161,6 +183,13 @@ class PipelineModelInitializer():
self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])
def initialize(self, partition_method='parameter'):
"""Initialize the model object from the config passed
:param partition_method: this parameter determines how you want to split your model layers into stages;
it can be set to 'layer' or 'parameter'
:type partition_method: str
"""
# Some space for initializing communication groups
self._interval = None
self._partition_layers(method=partition_method)
@ -183,7 +212,7 @@ class PipelineModelInitializer():
# print_rank_0(param_counts)
self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
else:
assert method == 'layer', "Method should be a pre-set string"
raise ValueError("Method should be a pre-set string in [layer, parameter]")
# Display the partition
if gpc.get_global_rank() == 0 and self.verbose:
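A hedged sketch of driving this class, based only on the constructor and `initialize` signatures shown in this hunk; `model_config` stands for a dict in the format given in the class docstring, and the import path follows the `colossalai.builder.pipeline` page added to the Sphinx toctree:
```python
from colossalai.builder.pipeline import PipelineModelInitializer

# model_config: a dict such as the VisionTransformerFromConfig example in the docstring
initializer = PipelineModelInitializer(model_config, num_chunks=1, verbose=True)

# initialize() is assumed here to return the partitioned model for the current stage
model = initializer.initialize(partition_method='parameter')
```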

View File

@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int,
:param tensor: Tensor to be gathered
:param dim: The dimension concatenating in
:param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor
:type tensor: :class:`torch.Tensor`
:type dim: int
:type parallel_mode: ParallelMode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by all-gather
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
depth = gpc.get_world_size(parallel_mode)
temp = tensor.clone()
@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int,
:param tensor: Tensor to be reduced and scattered
:param dim: The dimension scattering in
:param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor
:type tensor: :class:`torch.Tensor`
:type dim: int
:type parallel_mode: ParallelMode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by reduce-scatter
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
depth = gpc.get_world_size(parallel_mode)
# temp = list(torch.chunk(tensor, depth, dim=dim))
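An illustrative call of `all_gather` (the `reduce_scatter` helper above follows the same pattern); the tensor shape and the choice of `ParallelMode.GLOBAL` are arbitrary, and a distributed environment initialized by `colossalai.launch` is assumed:
```python
import torch
from colossalai.communication import all_gather
from colossalai.context import ParallelMode

x = torch.ones(2, 4).cuda()
# concatenate the per-rank tensors of the chosen parallel group along dim 0
gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.GLOBAL)
```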

View File

@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None):
:type input_tensor_shape: torch.Size
:type prev_rank: int, optional
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None
@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None):
:type output_grad_shape: torch.Size
:type next_rank: int, optional
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None
@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None):
:param output_tensor: Tensor to be sent
:param next_rank: The rank of the recipient of the tensor
:type output_tensor: Tensor
:type output_tensor: :class:`torch.Tensor`
:type next_rank: int, optional
"""
if not gpc.is_last_rank(ParallelMode.PIPELINE):
@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None):
:param input_tensor_grad: Tensor to be sent
:param prev_rank: The rank of the recipient of the tensor
:type input_tensor_grad: Tensor
:type input_tensor_grad: :class:`torch.Tensor`
:type prev_rank: int, optional
"""
if not gpc.is_first_rank(ParallelMode.PIPELINE):
@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor,
:param output_tensor: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received
:type output_tensor: Tensor
:type output_grad_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None
@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor
:type input_tensor_shape: torch.Size
:type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None
@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor,
:param output_tensor: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received
:type output_tensor: Tensor
:type input_tensor_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
input_tensor, _ = _communicate(tensor_send_next=output_tensor,
recv_prev=recv_prev,
@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor
:type output_grad_shape: torch.Size
:type input_tensor_grad: :class:`torch.Tensor`
:type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
_, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
recv_next=recv_next,
@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor,
:param input_tensor_grad: Tensor sent to the previous
:param input_tensor_shape: The shape of the tensor received from the previous
:param output_grad_shape: The shape of the tensor received from the next
:type output_tensor: Tensor
:type input_tensor_grad: Tensor
:type input_tensor_shape: torch.Size
:type output_grad_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:type output_grad_shape: :class:`torch.Size`
:return: (the input tensor in forward step, the grad of output tensor in forward step)
:rtype: (Tensor, Tensor)
"""

View File

@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
:param tensor_send_next: Tensor sent to next member
:param parallel_mode: Parallel group mode used in this communication
:type tensor_send_next: Tensor
:type parallel_mode: ParallelMode
:type tensor_send_next: :class:`torch.Tensor`
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor received from the previous
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
buffer_shape = tensor_send_next.size()

View File

@ -433,6 +433,9 @@ class ParallelContext:
def set_device(self, device_ordinal: int = None):
"""Sets distributed processes to be bound to devices.
:param device_ordinal: the device id to be bound to
:type device_ordinal: int
"""
global_rank = self.get_global_rank()
if device_ordinal is None:
@ -445,6 +448,9 @@ class ParallelContext:
def set_seed(self, seed: int):
"""Sets seeds for all random libraries.
:param seed: seed for random states
:type seed: int
"""
random.seed(seed)
np.random.seed(seed)

View File

@ -57,38 +57,61 @@ class Engine:
@property
def model(self):
"""model attached to the engine"""
return self._model
@property
def optimizer(self):
"""optimizer attached to the engine"""
return self._optimizer
@property
def criterion(self):
"""criterion attached to the engine"""
return self._criterion
@property
def schedule(self):
return self._schedule
def zero_grad(self):
"""set the gradient of parameters to zero
"""
self.optimizer.zero_grad()
def step(self):
"""execute parameter update
"""
self._all_reduce_gradients()
self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
self.optimizer.step()
def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
:param loss: loss value computed by a loss function
:type loss: :class:`torch.Tensor`
"""
return self.optimizer.backward(loss)
def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
:param tensor: output tensor
:type tensor: :class:`torch.Tensor`
:param grad: gradient passed back to the output
:type grad: :class:`torch.Tensor`
"""
return self.optimizer.backward_by_grad(tensor, grad)
def calc_loss(self, *args, **kwargs):
"""compute the loss value
:return: the loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs):
"""run the forward step for the model
:return: output of the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
"""
return self.model(*args, **kwargs)
def _all_reduce_gradients(self):
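Putting these methods together, a hedged sketch of one training iteration with an engine returned by `colossalai.initialize`; the inputs and labels are assumed to be ready on the right device:
```python
import torch

def train_step(engine, inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """One simplified training iteration using the Engine API shown above."""
    engine.zero_grad()                        # clear gradients
    outputs = engine(inputs)                  # forward pass via Engine.__call__
    loss = engine.criterion(outputs, labels)  # criterion attached to the engine
    engine.backward(loss)                     # backward handled by the wrapped optimizer
    engine.step()                             # all-reduce, clip and update parameters
    return loss
```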

View File

@ -48,7 +48,7 @@ class BaseSchedule(ABC):
already in the same GPU as where the model's.
:return: (data, label)
:rtype: (Tensor, Tensor)
:rtype: (:class:`torch.Tensor`, :class:`torch.Tensor`)
"""
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')

View File

@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule):
:type data_iter: Iterator
:type forward_only: bool, optional
:type return_loss: bool, optional
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
"""
assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."

View File

@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: a list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
"""
if input_tensor is None:
@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule):
output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
"""
# Retain the grad on the input_tensor.
@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param data_iter: dataloader in the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: whether to run the forward step only. Default is False. If True, no backward pass will be run.
:type forward_only: bool
:param return_loss: whether to return the loss value. Default is True.
:type return_loss: bool
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
"""
assert forward_only or return_loss, \

View File

@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict],
:param local_rank: rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional
:param verbose: whether to print logs
:type verbose: bool
:raises Exception: raise exception when config type is wrong
'''
gpc.verbose = verbose
@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
launch(config=config,
@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
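A hedged sketch of invoking this wrapper from a script started with torchrun or torch.distributed.launch; the config path, host and port are placeholders, and the top-level re-export as `colossalai.launch_from_torch` is assumed:
```python
import colossalai

# run with e.g.:  torchrun --nproc_per_node=4 train.py
colossalai.launch_from_torch(config='./config.py',  # placeholder config path
                             host='localhost',      # master address (placeholder)
                             port=29500,            # master port (placeholder)
                             backend='nccl')
```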
@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
:param model: your model instance
:type model: a single or a list of ``torch.nn.Module`` objects
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer instance
:type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects
:type optimizer: :class:`torch.optim.optimizer.Optimizer`
:param criterion: your criterion instance
:type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects
:param train_dataloader: dataloaders for training data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
:param train_dataloader: dataloaders for testing data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
:return: (engine, criterion, train_dataloader, test_dataloader)
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param train_dataloader: dataloader for training data
:type train_dataloader: :class:`torch.utils.data.DataLoader`
:param test_dataloader: dataloader for testing data
:type test_dataloader: :class:`torch.utils.data.DataLoader`
:param lr_scheduler: your lr scheduler instance
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param verbose: whether to print logs
:type verbose: bool
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: tuple
'''
# get logger
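Following the documented signature and return value, a hedged sketch of wiring everything through `colossalai.initialize`; the components are placeholders built beforehand, in the style of the README snippet:
```python
import torch
import colossalai

model = torch.nn.Linear(16, 4)                            # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # placeholder optimizer
criterion = torch.nn.CrossEntropyLoss()                   # placeholder criterion
train_dataloader = ...                                    # a torch.utils.data.DataLoader
test_dataloader = ...                                     # a torch.utils.data.DataLoader

engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader)
```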

View File

@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.
:param name: name of the logger, name must be unique
:type name: str
:return: a distributed logger instance
:rtype: :class:`colossalai.logging.DistributedLogger`
"""
return DistributedLogger.get_instance(name=name)
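For example (the `ranks` keyword mirrors its use elsewhere in this diff to restrict output to rank 0):
```python
from colossalai.logging import get_dist_logger

logger = get_dist_logger()
logger.info('training started', ranks=[0])  # log only on global rank 0
```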

View File

@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
@LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: size of a patch
:type patch_size: int
:param in_chans: input channels
:type in_chans: int
:param embed_dim: embedding dimension
:type embed_dim: int
:param norm_layer: layer norm class, defaults to None
:type norm_layer: Callable
:param flatten: whether to flatten the output
:type flatten: bool
:param drop: dropout rate
:type drop: float
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaViTMLP(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
:param in_features: input channels
:type in_features: int
:param hidden_features: channels of the output of the first dense layer
:type hidden_features: int
:param out_features: channels of the output of the second dense layer
:type out_features: int
:param act_layer: activation function
:type act_layer: Callable
:param drop: dropout rate
:type drop: float
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
:param drop_prob: probability for dropout
:type drop_prob: float
:param training: whether it is training mode
:type training: bool
"""
if drop_prob == 0. or not training:
return x
@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
@LAYERS.register_module
class VanillaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
:param drop_prob: probability for dropout
:type drop_prob: float
"""
def __init__(self, drop_prob=0.):
@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
:param dim: dimension of input tensor
:type dim: int
:param num_heads: number of attention heads, defaults to 8
:param num_heads: number of attention heads
:type num_heads: int, optional
:param qkv_bias: enable bias for qkv if True, defaults to False
:type qkv_bias: bool, optional
@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
:type proj_drop: float, optional
"""
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads

View File

@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
added functionality to handle model parallel parameters. Note that
the gradients are modified in place.
Arguments:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
:param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: max norm of the gradients
:type max_norm: float or int
:param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
Returns:
Total norm of the parameters (viewed as a single vector).
:return: Total norm of the parameters (viewed as a single vector).
:rtype: float
"""
if isinstance(parameters, torch.Tensor):

View File

@ -123,12 +123,23 @@ def get_dataloader(dataset,
stage and label on the last stage
:param dataset: a :class:utils.data.dataset dataset
:param shuffle: whether to shuffle the dataset
:param seed: random worker seed, defaults to 1024
:type seed: int, optional
:param add_sampler_if_possible: [description], defaults to False
:type add_sampler_if_possible: bool, optional
:return: a :class:utils.data.dataset dataloader
:rtype: torch.utils.data.dataset
:param add_sampler: add a DistributedDataParallelSampler to the dataloader
:param drop_last: drop the last incomplete batch of data
:param pin_memory: whether to pin memory address in CPU memory
:param num_workers: number of worker threads for this dataloader
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: an object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
'''
_kwargs = kwargs.copy()
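A hedged sketch of a fuller call with the documented keyword arguments; `train_dataset` is a placeholder, and forwarding `batch_size` through `**kwargs` to the underlying `torch.utils.data.DataLoader` is an assumption:
```python
from colossalai.utils import get_dataloader

train_dataloader = get_dataloader(dataset=train_dataset,  # a torch.utils.data.Dataset built beforehand
                                  shuffle=True,
                                  add_sampler=True,        # attach a distributed sampler
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=4,
                                  batch_size=64)           # forwarded to DataLoader (assumed)
```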

View File

@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
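The wrappers created above can also be used directly; a hedged sketch, assuming they are importable from `colossalai.utils.gradient_accumulation` as the new Sphinx page suggests, with `model`, `optimizer` and `train_dataloader` standing for objects built beforehand (in practice you would normally call the `accumulate_gradient` helper, whose full return value is cut off in this hunk):
```python
from colossalai.utils.gradient_accumulation import GradAccumOptimizer, GradAccumDataloader

# accumulate gradients over 4 steps: the optimizer only applies updates on steps 4, 8, ...
optimizer = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
train_dataloader = GradAccumDataloader(train_dataloader, accumulate_size=4)
```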

View File

@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param model: your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
super().__init__(optim)
@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader():
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumualate_size: int
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
self.dataloader = dataloader
@ -99,6 +123,15 @@ class GradAccumDataloader():
class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
self.lr_scheduler = lr_scheduler
@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler():
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param grad_handler: your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
assert isinstance(grad_handler, BaseGradientHandler), \

View File

@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
:param message: a prefix message to add in the log
:type message: str
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`
:param report_cpu: whether to report CPU memory
:type report_cpu: bool
:raises EnvironmentError: raise error if no distributed environment has been initialized
'''
if not gpc.is_initialized(ParallelMode.GLOBAL):
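For instance (the message string is arbitrary and a launched distributed environment is assumed):
```python
from colossalai.logging import get_dist_logger
from colossalai.utils import report_memory_usage

logger = get_dist_logger()
report_memory_usage('after the forward pass', logger=logger, report_cpu=True)
```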

View File

@ -2,6 +2,13 @@
class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
:param chunk_size: size of a chunk
:type chunk_size: int
"""
available = False
warned = False

View File

@ -74,6 +74,9 @@ class Timer:
class MultiTimer:
'''An object that contains multiple timers
:param on: whether the timer is enabled. Default is True
:type on: bool
'''
def __init__(self, on: bool = True):

View File

@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
optimizer: Optimizer,
level: int,
zero_config):
"""
A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param level: optimizer level, can be 2 or 3
:type level: int
:param zero_config: configuration for zero
:type zero_config: dict
:return: (model, optimizer)
:rtype: Tuple
"""
assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
if level == 2:
if is_no_pp_or_last_stage():
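A hedged sketch of the call following the documented signature; `model` and `optimizer` stand for objects built beforehand, and the empty `zero_config` is a placeholder for the settings in your config file:
```python
from colossalai.zero import convert_to_zero

# wrap the components with ZeRO level-2 optimizer state partitioning
model, optimizer = convert_to_zero(model=model,
                                   optimizer=optimizer,
                                   level=2,
                                   zero_config=dict())  # fill with ZeRO settings from your config
```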

View File

@ -1,76 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
IMG_SIZE = 224
BATCH_SIZE = 256
NUM_EPOCHS = 100
model = dict(
type='VanillaResNet',
block_type='ResNetBottleneck',
layers=[3, 4, 6, 3],
num_cls=10
)
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
parallelization = dict(
pipeline=1,
tensor=dict(size=1, mode=None),
)
optimizer = dict(
type='Adam',
lr=0.01
)
loss = dict(
type='CrossEntropyLoss'
)
from colossalai.engine import AMP_TYPE
fp16 = dict(
mode=AMP_TYPE.APEX,
opt_level='O2',
)

View File

@ -1,22 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
NUM_EPOCH = int
model = dict()
train_data = dict()
test_data = dict()
optimizer = dict()
loss = dict()
fp16 = dict()
zero = dict()
gradient_handler = []
parallel = dict()
hooks = []
cudnn_benchmark = True
cudnn_deterministic = False
logging = dict()

View File

@ -1,165 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
drop_last=True,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
loss = dict(
type='CrossEntropyLoss2D',
)
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='TensorboardHook', log_dir='./tb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
# mode=AMP_TYPE.PARALLEL,
# initial_scale=2 ** 8
# )
# only needed when pipeline parallel is used
# schedule = dict(
# num_microbatches=8
# )
logging = dict(
root_path='./logs'
)

View File

@ -1,111 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE
try:
import model_zoo
except:
print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')
BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
)
)
optimizer = dict(
type='Adam',
lr=0.001
)
loss = dict(
type='CrossEntropyLoss3D',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)
model = dict(
type='vit_tiny_3d_patch4_32',
drop_rate=0.1,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(
type='Accuracy3DHook',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=8, mode='3d'),
)
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
logging = dict(
root_path='./logs'
)

View File

@ -77,10 +77,10 @@ fp16 = dict(
)
```
## Tensor Parallel AMP
## Naive AMP
We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism.
and pipeline parallelism. This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
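The referenced code block is truncated in this diff; a minimal sketch of what such a config might look like under the new `colossalai.amp` API (the `NAIVE` member name and the scaler field are assumptions based on the `naive_amp` module and `FP16Optimizer` docstring added in this commit):
```python
from colossalai.amp import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.NAIVE,   # assumed member name, matching the naive_amp module added here
    initial_scale=2 ** 8,  # illustrative gradient-scaler setting from FP16Optimizer's parameters
)
```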

View File

@ -0,0 +1,5 @@
colossalai.amp.apex\_amp
==========================
.. automodule:: colossalai.amp.apex_amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.naive\_amp
==========================
.. automodule:: colossalai.amp.naive_amp
:members:

View File

@ -0,0 +1,13 @@
colossalai.amp
==================
.. toctree::
:maxdepth: 2
colossalai.amp.torch_amp
colossalai.amp.apex_amp
colossalai.amp.naive_amp
.. automodule:: colossalai.amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.torch\_amp
==========================
.. automodule:: colossalai.amp.torch_amp
:members:

View File

@ -1,12 +1,12 @@
colossalai.builder
==================
.. automodule:: colossalai.builder
:members:
.. toctree::
:maxdepth: 2
colossalai.builder.builder
colossalai.builder.pipeline
.. automodule:: colossalai.builder
:members:

View File

@ -1,5 +0,0 @@
colossalai.checkpointing
========================
.. automodule:: colossalai.checkpointing
:members:

View File

@ -1,10 +1,6 @@
colossalai.communication
========================
.. automodule:: colossalai.communication
:members:
.. toctree::
:maxdepth: 2
@ -12,3 +8,7 @@ colossalai.communication
colossalai.communication.p2p
colossalai.communication.ring
colossalai.communication.utils
.. automodule:: colossalai.communication
:members:

View File

@ -1,11 +1,11 @@
colossalai.context.random
=========================
.. automodule:: colossalai.context.random
:members:
.. toctree::
:maxdepth: 2
colossalai.context.random.seed_manager
.. automodule:: colossalai.context.random
:members:

View File

@ -1,9 +1,6 @@
colossalai.context
==================
.. automodule:: colossalai.context
:members:
.. toctree::
:maxdepth: 2
@ -17,3 +14,7 @@ colossalai.context
colossalai.context.config
colossalai.context.parallel_context
colossalai.context.parallel_mode
.. automodule:: colossalai.context
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.amp\_type
===============================
.. automodule:: colossalai.engine.amp.amp_type
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.grad\_scaler
==================================
.. automodule:: colossalai.engine.amp.grad_scaler
:members:

View File

@ -1,12 +0,0 @@
colossalai.engine.amp
=====================
.. automodule:: colossalai.engine.amp
:members:
.. toctree::
:maxdepth: 2
colossalai.engine.amp.amp_type
colossalai.engine.amp.grad_scaler

View File

@ -1,12 +1,12 @@
colossalai.engine
=================
.. automodule:: colossalai.engine
:members:
.. toctree::
:maxdepth: 2
colossalai.engine.amp
colossalai.engine.gradient_handler
colossalai.engine.schedule
.. automodule:: colossalai.engine
:members:

View File

@ -1,11 +1,11 @@
colossalai.logging
==================
.. automodule:: colossalai.logging
:members:
.. toctree::
:maxdepth: 2
colossalai.logging.logging
.. automodule:: colossalai.logging
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.base\_dataset
================================
.. automodule:: colossalai.nn.data.base_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.caltech101\_dataset
======================================
.. automodule:: colossalai.nn.data.caltech101_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.cifar10\_dataset
===================================
.. automodule:: colossalai.nn.data.cifar10_dataset
:members:

View File

@ -1,18 +0,0 @@
colossalai.nn.data
==================
.. automodule:: colossalai.nn.data
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler
.. toctree::
:maxdepth: 2
colossalai.nn.data.base_dataset
colossalai.nn.data.caltech101_dataset
colossalai.nn.data.cifar10_dataset

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.base\_sampler
========================================
.. automodule:: colossalai.nn.data.sampler.base_sampler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.data\_parallel\_sampler
==================================================
.. automodule:: colossalai.nn.data.sampler.data_parallel_sampler
:members:

View File

@ -1,12 +0,0 @@
colossalai.nn.data.sampler
==========================
.. automodule:: colossalai.nn.data.sampler
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler.base_sampler
colossalai.nn.data.sampler.data_parallel_sampler

View File

@ -0,0 +1,5 @@
colossalai.nn.layer.non\_parallel\_layers
==========================================
.. automodule:: colossalai.nn.layer.non_parallel_layers
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_1d
================================
.. automodule:: colossalai.nn.layer.parallel_1d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_1d.layers
.. automodule:: colossalai.nn.layer.parallel_1d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2d
================================
.. automodule:: colossalai.nn.layer.parallel_2d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_2d.layers
.. automodule:: colossalai.nn.layer.parallel_2d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2p5d
==================================
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_2p5d.layers
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_3d
================================
.. automodule:: colossalai.nn.layer.parallel_3d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_3d.layers
.. automodule:: colossalai.nn.layer.parallel_3d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_sequence
======================================
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_sequence.layers
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer.layers
========================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer
=================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_vision_transformer.layers

View File

@ -1,9 +1,6 @@
colossalai.nn.layer
===================
.. automodule:: colossalai.nn.layer
:members:
.. toctree::
:maxdepth: 2
@ -12,13 +9,10 @@ colossalai.nn.layer
colossalai.nn.layer.parallel_2p5d
colossalai.nn.layer.parallel_3d
colossalai.nn.layer.parallel_sequence
colossalai.nn.layer.parallel_vision_transformer
colossalai.nn.layer.vanilla_resnet
colossalai.nn.layer.vanilla_vision_transformer
colossalai.nn.layer.non_parallel_layers
colossalai.nn.layer.wrapper
.. toctree::
:maxdepth: 2
colossalai.nn.layer.base_layer
.. automodule:: colossalai.nn.layer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.basic\_block
================================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.basic_block
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.bottleneck
==============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.bottleneck
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.conv
========================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.conv
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.reslayer
============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.reslayer
:members:

View File

@ -1,14 +0,0 @@
colossalai.nn.layer.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.layer.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_resnet.basic_block
colossalai.nn.layer.vanilla_resnet.bottleneck
colossalai.nn.layer.vanilla_resnet.conv
colossalai.nn.layer.vanilla_resnet.reslayer

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer.layers
=======================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer
================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_vision_transformer.layers

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.base\_loss
=============================
.. automodule:: colossalai.nn.loss.base_loss
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.cross\_entropy\_1d
=====================================
.. automodule:: colossalai.nn.loss.cross_entropy_1d
:members:

View File

@ -1,15 +1,13 @@
colossalai.nn.loss
==================
.. automodule:: colossalai.nn.loss
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.loss.base_loss
colossalai.nn.loss.cross_entropy_1d
colossalai.nn.loss.cross_entropy_2d
colossalai.nn.loss.cross_entropy_2p5d
colossalai.nn.loss.cross_entropy_3d
.. automodule:: colossalai.nn.loss
:members:

View File

@ -1,10 +1,6 @@
colossalai.nn.lr\_scheduler
===========================
.. automodule:: colossalai.nn.lr_scheduler
:members:
.. toctree::
:maxdepth: 2
@ -15,3 +11,7 @@ colossalai.nn.lr\_scheduler
colossalai.nn.lr_scheduler.onecycle
colossalai.nn.lr_scheduler.poly
colossalai.nn.lr_scheduler.torch
.. automodule:: colossalai.nn.lr_scheduler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.model.base\_model
===============================
.. automodule:: colossalai.nn.model.base_model
:members:

View File

@ -0,0 +1,5 @@
colossalai.nn.model.model\_from\_config
========================================
.. automodule:: colossalai.nn.model.model_from_config
:members:

View File

@ -1,17 +1,7 @@
colossalai.nn.model
===================
.. automodule:: colossalai.nn.model
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vanilla_resnet
colossalai.nn.model.vision_transformer
.. toctree::
:maxdepth: 2
colossalai.nn.model.base_model
colossalai.nn.model.model_from_config

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vanilla\_resnet.resnet
==========================================
.. automodule:: colossalai.nn.model.vanilla_resnet.resnet
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.model.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vanilla_resnet.resnet

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vision\_transformer
=======================================
.. automodule:: colossalai.nn.model.vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vision_transformer.vision_transformer

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vision\_transformer.vision\_transformer
===========================================================
.. automodule:: colossalai.nn.model.vision_transformer.vision_transformer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.multi\_tensor\_apply.multi\_tensor\_apply
=======================================================
.. automodule:: colossalai.nn.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.multi\_tensor\_apply
==================================
.. automodule:: colossalai.nn.multi_tensor_apply
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.multi_tensor_apply.multi_tensor_apply

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.fp16\_optimizer
=======================================
.. automodule:: colossalai.nn.optimizer.fp16_optimizer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.loss\_scaler
====================================
.. automodule:: colossalai.nn.optimizer.loss_scaler
:members:

View File

@ -1,20 +1,15 @@
colossalai.nn.optimizer
=======================
.. automodule:: colossalai.nn.optimizer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.optimizer.fp16_optimizer
colossalai.nn.optimizer.fused_adam
colossalai.nn.optimizer.fused_lamb
colossalai.nn.optimizer.fused_sgd
colossalai.nn.optimizer.lamb
colossalai.nn.optimizer.lars
colossalai.nn.optimizer.loss_scaler
colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
.. automodule:: colossalai.nn.optimizer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_1
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_2
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_3
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
:members:

View File

@ -1,16 +1,15 @@
colossalai.nn
=============
.. automodule:: colossalai.nn
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data
colossalai.nn.layer
colossalai.nn.loss
colossalai.nn.lr_scheduler
colossalai.nn.model
colossalai.nn.multi_tensor_apply
colossalai.nn.optimizer
.. automodule:: colossalai.nn
:members:

View File

@ -1,11 +1,11 @@
colossalai.registry
===================
.. automodule:: colossalai.registry
:members:
.. toctree::
:maxdepth: 2
colossalai.registry.registry
.. automodule:: colossalai.registry
:members:

View File

@ -1,12 +1,18 @@
colossalai
==========
.. automodule:: colossalai
:members:
.. toctree::
:maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize
.. toctree::
:maxdepth: 2
colossalai.amp
colossalai.builder
colossalai.communication
colossalai.context
@ -16,11 +22,7 @@ colossalai
colossalai.registry
colossalai.trainer
colossalai.utils
colossalai.zero
.. toctree::
:maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize
.. automodule:: colossalai
:members:

View File

@ -1,9 +1,6 @@
colossalai.trainer
==================
.. automodule:: colossalai.trainer
:members:
.. toctree::
:maxdepth: 2
@ -14,3 +11,7 @@ colossalai.trainer
:maxdepth: 2
colossalai.trainer.metric
.. automodule:: colossalai.trainer
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.data\_sampler
=======================================
.. automodule:: colossalai.utils.data_sampler
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.gradient\_accumulation
=======================================
.. automodule:: colossalai.utils.gradient_accumulation
:members:

View File

@ -0,0 +1,8 @@
colossalai.utils.multi\_tensor\_apply
======================================
.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,10 +1,6 @@
colossalai.utils
================
.. automodule:: colossalai.utils
:members:
.. toctree::
:maxdepth: 2
@ -12,5 +8,12 @@ colossalai.utils
colossalai.utils.checkpointing
colossalai.utils.common
colossalai.utils.cuda
colossalai.utils.data_sampler
colossalai.utils.gradient_accumulation
colossalai.utils.memory
colossalai.utils.multi_tensor_apply
colossalai.utils.timer
.. automodule:: colossalai.utils
:members:

View File

@ -0,0 +1,5 @@
colossalai.zero
================
.. automodule:: colossalai.zero
:members:

Some files were not shown because too many files have changed in this diff.