@@ -29,12 +29,12 @@ from colossalai.global_variables import moe_env
 
 
 def get_default_parser():
-    '''Reads user command line and uses an argument parser to parse the input arguments.
+    """Reads user command line and uses an argument parser to parse the input arguments.
     Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
 
-    :return: returns the parser with the default arguments, the user may add customized arguments into this parser
+    :return: Returns the parser with the default arguments, the user may add customized arguments into this parser
     :rtype: Namespace
-    '''
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', type=str, help='path to the config file')
     parser.add_argument('--host',
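
For context, a minimal usage sketch of the parser helper above (not part of the patch); the --batch_size flag and the train.py entry point are hypothetical user additions:

    # train.py -- extend the default parser with a custom argument
    import colossalai

    parser = colossalai.get_default_parser()                    # preset: --config, --host, --port, ...
    parser.add_argument('--batch_size', type=int, default=32)   # user-defined extra argument
    args = parser.parse_args()
    print(args.config, args.batch_size)
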
@@ -64,28 +64,30 @@ def launch(config: Union[str, Path, Config, Dict],
            local_rank: int = None,
            seed: int = 1024,
            verbose: bool = True):
-    '''This function first parses the configuration arguments, using :func:parse_args() in case one of the input arguments are not given.
-    Then initialize and set distributed environment by calling global_context's functions.
+    """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
+    arguments is not given. Then initialize and set distributed environment by calling global_context's functions.
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param rank: rank for the default process group
+    :param rank: Rank for the default process group
     :type rank: int
-    :param world_size: world size of the default process group
+    :param world_size: World size of the default process group
     :type world_size: int
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param local_rank: rank for the process on the node and is used to set the default CUDA device,
-        defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
+        If local_rank = None, the default device ordinal will be calculated automatically
     :type local_rank: int, optional
-    :param verbose: whether to print logs
-    :type verbose: bool
-    :raises Exception: raise exception when config type is wrong
-    '''
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    :raises Exception: Raise exception when config type is wrong
+    """
     gpc.verbose = verbose
 
     # set config
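
A sketch of calling launch directly with the arguments documented in the docstring above, for a single process on the local machine (the config path and port values are illustrative, not part of the patch):

    # one process, world size 1, master on localhost
    import colossalai

    colossalai.launch(config='./config.py',   # config file path, dict or Config object
                      rank=0,
                      world_size=1,
                      host='127.0.0.1',
                      port=29500,
                      backend='nccl',
                      seed=1024)
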
@@ -123,20 +125,22 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
                       backend: str = 'nccl',
                       seed: int = 1024,
                       verbose: bool = True):
-    '''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
+    """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
     set by SLURM
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['SLURM_PROCID'])
     world_size = int(os.environ['SLURM_NPROCS'])
     launch(config=config,
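
A sketch of the SLURM path documented above: SLURM_PROCID and SLURM_NPROCS supply rank and world size, so the script only passes the config plus the master endpoint (hostname and port are illustrative):

    # started under SLURM, e.g. `srun python train.py`
    import colossalai

    colossalai.launch_from_slurm(config='./config.py',
                                 host='node001',   # master node address
                                 port=29500)       # master port
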
@@ -155,20 +159,22 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
                         backend: str = 'nccl',
                         seed: int = 1024,
                         verbose: bool = True):
-    '''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
+    """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
     set by OpenMPI
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
+    :param host: The master address for distributed training
     :type host: str
-    :param port: the master port for distributed training
+    :param port: The master port for distributed training
     :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
     local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
     world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@@ -187,20 +193,18 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
                       backend: str = 'nccl',
                       seed: int = 1024,
                       verbose: bool = True):
-    '''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
+    """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
     from the environment variables set by PyTorch
 
-    :param config: config file or config file path are both acceptable
+    :param config: Config file or config file path are both acceptable
     :type config: Union[str, dict, Config]
-    :param host: the master address for distributed training
-    :type host: str
-    :param port: the master port for distributed training
-    :type port: str
-    :param backend: backend for torch.distributed
-    :type backend: str
-    :param verbose: whether to print logs
-    :type verbose: bool
-    '''
+    :param backend: Backend for torch.distributed
+    :type backend: str, optional
+    :param seed: Specified random seed for every process
+    :type seed: int, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
+    """
     rank = int(os.environ['RANK'])
     local_rank = int(os.environ['LOCAL_RANK'])
     world_size = int(os.environ['WORLD_SIZE'])
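
A sketch of the torchrun path, assuming (as the environment-variable reads above suggest) that this wrapper needs only the config, with rank, local rank, world size and the master endpoint taken from the variables exported by torchrun or torch.distributed.launch:

    # started via e.g. `torchrun --nproc_per_node=2 train.py`
    import colossalai

    colossalai.launch_from_torch(config='./config.py')
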
@@ -225,25 +229,26 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
                lr_scheduler: _LRScheduler = None,
                verbose: bool = True
                ) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
-    '''Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
+    """Core function to wrap the essential training components with our functionality based on the config which is
+    loaded into gpc.config.
 
-    :param model: your model instance
+    :param model: Your model instance
     :type model: :class:`torch.nn.Module`
-    :param optimizer: your optimizer instance
+    :param optimizer: Your optimizer instance
     :type optimizer: :class:`torch.optim.optimizer.Optimizer`
-    :param criterion: your criterion instance
+    :param criterion: Your criterion instance
     :type criterion: :class:`torch.nn.modules.loss._Loss`
-    :param train_dataloader: dataloader for training data
-    :type train_dataloader: :class:`torch.utils.data.DataLoader`
-    :param train_dataloader: dataloader for testing data
-    :type train_dataloader: :class:`torch.utils.data.DataLoader`
-    :param lr_scheduler: your lr scheduler instance
-    :type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`
-    :param verbose: whether to print logs
-    :type verbose: bool
+    :param train_dataloader: Dataloader for training
+    :type train_dataloader: :class:`torch.utils.data.DataLoader`, optional
+    :param test_dataloader: Dataloader for testing
+    :type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
+    :param lr_scheduler: Your lr scheduler instance
+    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`, optional
+    :param verbose: Whether to print logs
+    :type verbose: bool, optional
     :return: (engine, train_dataloader, test_dataloader, lr_scheduler)
-    :rtype: tuple
-    '''
+    :rtype: Tuple
+    """
     # get logger
     logger = get_dist_logger()
     gpc.verbose = verbose
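
Finally, a sketch of how the components documented in this docstring are handed to initialize after one of the launch_* calls; the model, optimizer, criterion and dataloader below are placeholders, not part of the patch:

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    import colossalai

    colossalai.launch_from_torch(config='./config.py')   # or any other launch_* variant

    model = nn.Linear(32, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(TensorDataset(torch.randn(64, 32),
                                            torch.randint(0, 2, (64,))),
                              batch_size=16)

    # returns (engine, train_dataloader, test_dataloader, lr_scheduler) as documented above
    engine, train_loader, _, _ = colossalai.initialize(model=model,
                                                       optimizer=optimizer,
                                                       criterion=criterion,
                                                       train_dataloader=train_loader)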