Optimize pipeline schedule (#94)

* add pipeline shared module wrapper and update load batch
* added model parallel process group for amp and clip grad (#86)
* added model parallel process group for amp and clip grad
* update amp and clip with model parallel process group
* remove pipeline_prev/next group (#88)
* micro batch offload
* optimize pipeline gpu memory usage
* pipeline can receive tensor shape (#93)
* optimize pipeline gpu memory usage
* fix grad accumulation step counter
* rename classes and functions

Co-authored-by: Frank Lee <somerlee.9@gmail.com>
2021-12-30 15:56:46 +08:00 · 2021-12-30 15:56:46 +08:00 · 96780e6ee4
parent e5b9f9a08d
commit 96780e6ee4
29 changed files with 423 additions and 290 deletions
--- a/colossalai/amp/naive_amp/_fp16_optimizer.py
+++ b/colossalai/amp/naive_amp/_fp16_optimizer.py
@ -359,12 +359,7 @@ class FP16Optimizer(Optimizer):
        # Update across all model parallel instances.
        torch.distributed.all_reduce(self.found_inf,
                                     op=torch.distributed.ReduceOp.MAX,
-                                     group=gpc.get_group(ParallelMode.TENSOR))
-
-        if is_using_pp():
-            torch.distributed.all_reduce(self.found_inf,
-                                         op=torch.distributed.ReduceOp.MAX,
-                                         group=gpc.get_group(ParallelMode.PIPELINE))
+                                     group=gpc.get_group(ParallelMode.MODEL))

        # Check for nan.
        found_inf_flag = (self.found_inf.item() > 0)
--- a/colossalai/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/amp/torch_amp/_grad_scaler.py
@ -11,6 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from colossalai.context import ParallelMode
 import torch.distributed as dist
 from colossalai.core import global_context as gpc
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


 class _MultiDeviceReplicator(object):
@ -247,10 +248,14 @@ class GradScaler(object):
                                                                         device),
                                                                     per_device_inv_scale.get(device))
        # For tensor parallel parameters it should be all-reduced over tensor parallel process group
-        if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1:
-            for tensor in per_device_found_inf._per_device_tensors.values():
-                dist.all_reduce(tensor, op=dist.ReduceOp.MAX,
-                                group=gpc.get_group(ParallelMode.TENSOR))
+        if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
+            vals = [val for val in per_device_found_inf._per_device_tensors.values()]
+            coalesced = _flatten_dense_tensors(vals)
+            dist.all_reduce(coalesced,
+                            op=dist.ReduceOp.MAX,
+                            group=gpc.get_group(ParallelMode.MODEL))
+            for buf, synced in zip(vals, _unflatten_dense_tensors(coalesced, vals)):
+                buf.copy_(synced)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer):
--- a/colossalai/builder/pipeline.py
+++ b/colossalai/builder/pipeline.py
@ -112,7 +112,7 @@ def _binary_search(weights, num):
    return intervals


-def _partition_uniform(num_items, pipeline_parallel_size, num_chunks):
+def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
    assert num_items % num_chunks == 0, \
        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"

@ -134,11 +134,11 @@ def _partition_uniform(num_items, pipeline_parallel_size, num_chunks):
    return parts


-def _partition_balanced(weights, pipeline_parallel_size, num_chunks):
+def partition_balanced(weights, pipeline_parallel_size, num_chunks):
    num_total = pipeline_parallel_size * num_chunks
    num_items = len(weights)
    if num_items <= num_total:
-        return _partition_uniform(num_items, pipeline_parallel_size, num_chunks)
+        return partition_uniform(num_items, pipeline_parallel_size, num_chunks)

    intervals = _binary_search(weights, num_total)

@ -151,7 +151,7 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):
    return parts


-def _count_layer_params(layers):
+def count_layer_params(layers):
    """Count the number of parameters in each layer
    """
    param_counts = [0] * len(layers)
@ -201,11 +201,11 @@ def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method:
    # Make a partition
    if method == 'layer':
        num_layers = len(layers)
-        parts = _partition_uniform(num_layers, pipeline_parallel_size, num_chunks)
+        parts = partition_uniform(num_layers, pipeline_parallel_size, num_chunks)
    elif method == 'parameter':
-        param_counts = _count_layer_params(layers)
+        param_counts = count_layer_params(layers)
        # print_rank_0(param_counts)
-        parts = _partition_balanced(param_counts, pipeline_parallel_size, num_chunks)
+        parts = partition_balanced(param_counts, pipeline_parallel_size, num_chunks)
    else:
        raise ValueError("Method should be a pre-set string in [layer, parameter]")

@ -250,7 +250,7 @@ def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bo
    """
    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
-    partitions = _partition_uniform(len(layers), pipeline_parallel_size, num_chunks)
+    partitions = partition_uniform(len(layers), pipeline_parallel_size, num_chunks)
    module_list = []
    for start, end in partitions[pipeline_rank]:
        module_list.append(nn.Sequential(*layers[start:end]))
--- a/colossalai/constants.py
+++ b/colossalai/constants.py
@ -14,7 +14,8 @@ INITIALIZER_MAPPING = {
    '2d': 'Initializer_2D',
    '2.5d': 'Initializer_2p5D',
    '3d': 'Initializer_3D',
-    'sequence': 'Initializer_Sequence'
+    'sequence': 'Initializer_Sequence',
+    'model': 'Initializer_Model'
 }

 # 1D parallel
--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
@ -394,6 +394,9 @@ class ParallelContext:
        # LSG: init data parallel process group for compatibility with other parallel module such as zero
        pg_init.append(dict(type=INITIALIZER_MAPPING['data']))

+        # LSG: init model parallel process group for compatibility with amp and clip grad
+        pg_init.append(dict(type=INITIALIZER_MAPPING['model']))
+
        if self.pipeline_parallel_size > 1:
            pg_init.append(dict(type=INITIALIZER_MAPPING['pipeline']))
        pg_init.append(dict(type=INITIALIZER_MAPPING['tensor']))
--- a/colossalai/context/parallel_mode.py
+++ b/colossalai/context/parallel_mode.py
@ -14,10 +14,12 @@ class ParallelMode(Enum):
    # common parallel
    DATA = 'data'

+    # model parallel - containing tensor and pipeline parallel groups
+    # this is added to facilitate amp and grad clipping in hybrid parallel
+    MODEL = 'model'
+
    # pipeline parallel
    PIPELINE = 'pipe'
-    PIPELINE_PREV = 'pipe_prev'
-    PIPELINE_NEXT = 'pipe_next'

    # containing all ranks in tensor parallel
    TENSOR = 'tensor'
--- a/colossalai/context/process_group_initializer/init.py
+++ b/colossalai/context/process_group_initializer/init.py
@ -6,10 +6,11 @@ from .initializer_data import Initializer_Data
 from .initializer_pipeline import Initializer_Pipeline
 from .initializer_sequence import Initializer_Sequence
 from .initializer_tensor import Initializer_Tensor
+from .initializer_model import Initializer_Model
 from .process_group_initializer import ProcessGroupInitializer

 __all__ = [
    'Initializer_Tensor', 'Initializer_Sequence', 'Initializer_Pipeline',
    'Initializer_Data', 'Initializer_2p5D', 'Initializer_2D', 'Initializer_3D',
-    'Initializer_1D', 'ProcessGroupInitializer'
+    'Initializer_1D', 'ProcessGroupInitializer', 'Initializer_Model'
 ]
--- a/colossalai/context/process_group_initializer/initializer_model.py
+++ b/colossalai/context/process_group_initializer/initializer_model.py
@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch.distributed as dist
+
+from colossalai.context import Config
+from colossalai.registry import DIST_GROUP_INITIALIZER
+from .process_group_initializer import ProcessGroupInitializer
+from ..parallel_mode import ParallelMode
+
+
+@DIST_GROUP_INITIALIZER.register_module
+class Initializer_Model(ProcessGroupInitializer):
+    '''A ProcessGroupInitializer for model parallelism (model parallel group contains pipeline and tensor parallel groups).
+    '''
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model_parallel_size = self.tensor_parallel_size * self.pipeline_parallel_size
+        self.num_group = self.world_size // self.model_parallel_size
+
+    def init_dist_group(self):
+        '''Initialize model parallel groups, and assign local_ranks and groups to each gpu.
+
+        :return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
+        :rtype: tuple
+        '''
+        local_rank = None
+        ranks_in_group = None
+        process_group = None
+        group_world_size = None
+        mode = ParallelMode.MODEL
+
+        for i in range(self.num_group):
+            ranks = [i * self.model_parallel_size + j for j in range(self.model_parallel_size)]
+            group = dist.new_group(ranks)
+
+            if self.rank in ranks:
+                local_rank = ranks.index(self.rank)
+                group_world_size = len(ranks)
+                process_group = group
+                ranks_in_group = ranks
+        return local_rank, group_world_size, process_group, ranks_in_group, mode
--- a/colossalai/context/process_group_initializer/initializer_pipeline.py
+++ b/colossalai/context/process_group_initializer/initializer_pipeline.py
@ -36,28 +36,4 @@ class Initializer_Pipeline(ProcessGroupInitializer):
                               process_group, ranks_in_group,
                               ParallelMode.PIPELINE)))

-                for k in range(pipe_group_size):
-                    first = pipe_ranks[k]
-                    second = pipe_ranks[(k + 1) % pipe_group_size]
-                    ranks = [first, second]
-                    group = dist.new_group(ranks)
-                    if self.rank == first:
-                        local_rank = 0
-                        group_world_size = 2
-                        process_group = group
-                        ranks_in_group = ranks
-                        dist_settings.append(
-                            tuple((local_rank, group_world_size,
-                                   process_group, ranks_in_group,
-                                   ParallelMode.PIPELINE_NEXT)))
-                    elif self.rank == second:
-                        local_rank = 1
-                        group_world_size = 2
-                        process_group = group
-                        ranks_in_group = ranks
-                        dist_settings.append(
-                            tuple((local_rank, group_world_size,
-                                   process_group, ranks_in_group,
-                                   ParallelMode.PIPELINE_PREV)))
-
        return dist_settings
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@ -2,15 +2,12 @@
 # -*- encoding: utf-8 -*-


-import torch
 from typing import List
 from torch.nn import Module
 from torch.nn.modules.loss import _Loss
 from torch.optim import Optimizer

-from colossalai.builder import build_gradient_handler
 from colossalai.logging import get_dist_logger
-from colossalai.utils import is_using_ddp, is_using_pp
 from torch import Tensor


--- a/colossalai/engine/gradient_handler/init.py
+++ b/colossalai/engine/gradient_handler/init.py
@ -1,5 +1,7 @@
 from ._base_gradient_handler import BaseGradientHandler
 from ._data_parallel_gradient_handler import DataParallelGradientHandler
 from ._zero_gradient_handler import ZeROGradientHandler
+from ._pipeline_parallel_gradient_handler import PipelineSharedModuleGradientHandler

-__all__ = ['BaseGradientHandler', 'DataParallelGradientHandler', 'ZeROGradientHandler']
+__all__ = ['BaseGradientHandler', 'DataParallelGradientHandler',
+           'ZeROGradientHandler', 'PipelineSharedModuleGradientHandler']
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import torch.distributed as dist
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from colossalai.core import global_context as gpc
+from colossalai.registry import GRADIENT_HANDLER
+from ._base_gradient_handler import BaseGradientHandler
+from collections import defaultdict
+
+
+@GRADIENT_HANDLER.register_module
+class PipelineSharedModuleGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in sub parallel groups.
+    An all-reduce collective communication will be performed in
+    :func:`handle_gradient` among all sub pipeline parallel groups.
+    For better performance, it bucketizes the gradients of all parameters that are
+    the same type to improve the efficiency of communication.
+    """
+
+    def handle_gradient(self):
+        """A method running an all-reduce operation in sub pipeline parallel groups.
+        """
+        if gpc.pipeline_parallel_size > 1:
+            # bucketize and all-reduce
+            buckets = defaultdict(lambda: defaultdict(list))
+            # Pack the buckets.
+            for param in self._model.parameters():
+                group = getattr(param, 'pipeline_shared_module_pg', None)
+                if param.requires_grad and param.grad is not None and group is not None:
+                    tp = param.data.type()
+                    buckets[group][tp].append(param)
+
+            # For each bucket, all-reduce and copy all-reduced grads.
+            for group, group_buckets in buckets.items():
+                for tp, bucket in group_buckets.items():
+                    grads = [param.grad.data for param in bucket]
+                    coalesced = _flatten_dense_tensors(grads)
+                    dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
+                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@ -5,8 +5,7 @@ from abc import ABC, abstractmethod

 import torch

-from torch import Tensor
-from typing import Iterable, Union, List, Callable
+from typing import Iterable,  Callable
 from .._base_engine import Engine
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
@ -32,18 +31,17 @@ class BaseSchedule(ABC):
        return element

    def _move_to_device(self, data):
-        if isinstance(data, (tuple, list)):
-            data = tuple([self._move_tensor(d) for d in data])
-        elif torch.is_tensor(data):
-            data = data.to(get_current_device()).detach()
+        if isinstance(data, dict):
+            data = {k: self._move_tensor(v) for k, v in data.items()}
+        else:
+            data = self._move_tensor(data)
        return data

-    def _to_list(self, data):
-        if torch.is_tensor(data):
-            return [data]
-        return data
+    @staticmethod
+    def _check_sanity(data, tag):
+        assert isinstance(data, (torch.Tensor, dict)), f'{tag} must be torch.Tensor or dict'

-    def load_batch(self, data_iter):
+    def load_batch(self, data_iter, to_gpu=True):
        """Loads a batch from data iterator. It returns the data and labels which are
        already in the same GPU as where the model's.

@ -58,13 +56,17 @@ class BaseSchedule(ABC):
            data, label = self.batch_data_process_func(batch_data)
        else:
            data, label = batch_data
-
-        if isinstance(label, (tuple, list)):
-            self.batch_size = label[0].size(0)
+        self._check_sanity(data, 'data')
+        self._check_sanity(label, 'label')
+        if isinstance(data, torch.Tensor):
+            self.batch_size = data.size(0)
        else:
-            self.batch_size = label.size(0)
-        data, label = self._to_list(split_batch(data)), self._to_list(split_batch(label))
+            self.batch_size = next(iter(data.values())).size(0)    
+        data, label = split_batch(data), split_batch(label)
+        if to_gpu:
            return self._move_to_device(data), self._move_to_device(label)
+        return data, label
+

    def pre_processing(self, engine: Engine):
        """To perform actions before running the schedule.
@ -76,7 +78,8 @@ class BaseSchedule(ABC):
                              engine: Engine,
                              data_iter: Iterable,
                              forward_only: bool,
-                              return_loss: bool = True
+                              return_loss: bool = True,
+                              return_output_label: bool = True
                              ):
        """The process function over a batch of dataset for training or evaluation.

@ -85,5 +88,24 @@ class BaseSchedule(ABC):
        :param labels: ground truth
        :param forward_only: If True, the process won't include backward
        :param return_loss: If False, the loss won't be returned
+        :param return_output_label: If False, the output and label won't be returned
        """
        pass
+
+    @staticmethod
+    def _call_engine(engine, inputs):
+        if isinstance(inputs, torch.Tensor):
+            return engine(inputs)
+        else:
+            return engine(**inputs)
+
+    @staticmethod
+    def _call_engine_criterion(engine, outputs, labels):
+        assert isinstance(outputs, (torch.Tensor, list, tuple)
+                          ), f'Expect output of model is (torch.Tensor, list, tuple), got {type(outputs)}'
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs, )
+        if isinstance(labels, torch.Tensor):
+            return engine.criterion(*outputs, labels)
+        else:
+            return engine.criterion(*outputs, **labels)
--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_non_pipeline_schedule.py
@ -5,9 +5,7 @@ from typing import Iterable

 import torch

-import torch.nn as nn
 from colossalai.engine import Engine
-from torch.optim import Optimizer
 from ._base_schedule import BaseSchedule
 from colossalai.utils import conditional_context

@ -27,17 +25,20 @@ class NonPipelineSchedule(BaseSchedule):
                              engine: Engine,
                              data_iter: Iterable,
                              forward_only: bool = False,
-                              return_loss: bool = True):
+                              return_loss: bool = True,
+                              return_output_label: bool = True):
        """The process function that loads a batch of dataset and feeds it to the model.
        The returned labels and loss will be None if :attr:`return_loss` is False.
        :param engine: Model for training and inference
        :param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
        :param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
        :param return_loss: Loss will be returned if True
+        :param return_output_label: Output and label will be returned if True
        :type engine: Iterator
        :type data_iter: Iterator
        :type forward_only: bool, optional
        :type return_loss: bool, optional
+        :type return_output_label: bool, optional

        :return: (output, label, loss)
        :rtype: Tuple[:class:`torch.Tensor`]
@ -48,16 +49,20 @@ class NonPipelineSchedule(BaseSchedule):

        # forward
        with conditional_context(torch.no_grad(), enable=forward_only):
-            output = engine(*data)
-            if not isinstance(output, (tuple, list)):
-                output = (output,)
+            output = self._call_engine(engine, data)
            if return_loss:
-                loss = engine.criterion(*output, *label)
+                loss = self._call_engine_criterion(engine, output, label)

        if not forward_only:
            engine.backward(loss)

+        if return_output_label:
            if return_loss:
                return output, label, loss
            else:
-            return output, None, None
+                return output, label, None
+        else:
+            if return_loss:
+                return None, None, loss
+            else:
+                return None, None, None
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@ -1,19 +1,19 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-from typing import Union
-
+from typing import List, Tuple, Union, Callable
+import inspect
 import torch.cuda
-import torch.distributed as dist
 from torch import Tensor

 from colossalai.communication import *
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.amp.naive_amp import NaiveAMPModel
+from colossalai.utils.cuda import get_current_device
 from colossalai.zero import (ZeroRedundancyOptimizer_Level_2,
                             ZeroRedundancyOptimizer_Level_3)
-from colossalai.utils import get_current_device, switch_virtual_pipeline_parallel_rank
+from colossalai.utils import switch_virtual_pipeline_parallel_rank
 from ._base_schedule import BaseSchedule


@ -30,102 +30,79 @@ class PipelineSchedule(BaseSchedule):
    :class:`NonPipelineSchedule`.

    :param num_microbatches: The number of microbatches
-    :param amp_type: The type of automatic mixed precision
-    :param amp_config: The configuration of automatic mixed procision
-    :param sync_data: If set to `True`, will sync data every batch over pipeline stages
    :type num_microbatches: int
-    :type amp_type: AMP_TYPE
-    :type amp_config: dict
-    :type sync_data: bool
+    :param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
+    :type batch_data_process_func: Callable
    """

    def __init__(self,
                 num_microbatches,
-                 sync_data: bool = True):
-        super().__init__()
-
+                 batch_data_process_func: Callable = None,
+                 tensor_shape: Union[torch.Size, List[int], Tuple[int]] = None):
+        super().__init__(batch_data_process_func=batch_data_process_func)
        self.num_microbatches = num_microbatches
-        self.sync_data = sync_data
        self.dtype = torch.float
+        self.tensor_shape = tensor_shape

-    def _move_to_device(self, data):
-        if isinstance(data, (
-                tuple,
-                list,
-        )):
-            assert len(data) == 1, "Data tuple's length in pipeline should be 1"
-            data = data[0]
-        assert torch.is_tensor(data), "Data in pipeline should be tensor"
-        data = data.to(get_current_device()).detach()
-        return data
-
-    def _sync_data(self):
-        reqs = []
-        if gpc.is_first_rank(ParallelMode.PIPELINE):
-            src_rank = gpc.get_global_rank()
-            reqs.append(dist.broadcast(
-                tensor=self.batch_data,
-                src=src_rank,
-                group=gpc.get_group(ParallelMode.PIPELINE_PREV),
-                async_op=True
-            ))
-            reqs.append(dist.broadcast(
-                tensor=self.batch_label,
-                src=src_rank,
-                group=gpc.get_group(ParallelMode.PIPELINE_PREV),
-                async_op=True
-            ))
-        if gpc.is_last_rank(ParallelMode.PIPELINE):
-            src_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
-            reqs.append(dist.broadcast(
-                tensor=self.batch_data,
-                src=src_rank,
-                group=gpc.get_group(ParallelMode.PIPELINE_NEXT),
-                async_op=True
-            ))
-            reqs.append(dist.broadcast(
-                tensor=self.batch_label,
-                src=src_rank,
-                group=gpc.get_group(ParallelMode.PIPELINE_NEXT),
-                async_op=True
-            ))
-        for req in reqs:
-            req.wait()
-
-    # Pipeline schedule just puts data in memory
    def load_batch(self, data_iter):
-        if data_iter is None:
-            raise RuntimeError('Dataloader is not defined.')
-        self.batch_pos = 0
-        data, label = next(data_iter)
-        self.batch_data, self.batch_label = \
-            self._move_to_device(data), self._move_to_device(label)
-        batch_size = self.batch_data.shape[0]
-        assert batch_size % self.num_microbatches == 0, \
+        # Pipeline schedule just puts data in memory
+        self.batch_data, self.batch_label = super().load_batch(data_iter, to_gpu=False)
+        self.microbatch_offset = 0
+        assert self.batch_size % self.num_microbatches == 0, \
            "Batch size should divided by the number of microbatches"
-        self.microbatch_size = batch_size // self.num_microbatches
-        if self.sync_data:
-            self._sync_data()
+        self.microbatch_size = self.batch_size // self.num_microbatches

-    def _get_data_slice(self, tensor):
-        return tensor[self.batch_pos: self.batch_pos + self.microbatch_size]
+    def _get_data_slice(self, data, offset):
+        if isinstance(data, torch.Tensor):
+            return data[offset: offset + self.microbatch_size]
+        else:
+            return {k: v[offset:offset + self.microbatch_size] for k, v in data.items()}

    def load_micro_batch(self):
-        data = self._get_data_slice(self.batch_data)
-        label = self._get_data_slice(self.batch_label)
-        self.batch_pos += self.microbatch_size
-        return (data,), (label,)
+        data = self._get_data_slice(self.batch_data, self.microbatch_offset)
+        label = self._get_data_slice(self.batch_label, self.microbatch_offset)
+        self.microbatch_offset += self.microbatch_size
+        return self._move_to_device(data), self._move_to_device(label)

    def pre_processing(self, engine):
        if isinstance(engine.optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)):
            raise TypeError(
                "Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3"
            )
-
-        if isinstance(engine.model, NaiveAMPModel):
+        model = engine.model
+        if isinstance(model, NaiveAMPModel):
            self.dtype = torch.half
+            model = model.model
+        sig = inspect.signature(model.forward)
+        for p in sig.parameters.values():
+            assert p.kind != inspect.Parameter.VAR_POSITIONAL, '*args is not supported'

-    def forward_step(self, engine, input_tensor, return_tensors, return_loss=True):
+    @staticmethod
+    def _call_engine(model, input_tensor, batch_data):
+        if isinstance(model, NaiveAMPModel):
+            sig = inspect.signature(model.model.forward)
+        else:
+            sig = inspect.signature(model.forward)
+        if isinstance(batch_data, torch.Tensor):
+            if input_tensor is None:
+                return model(batch_data)
+            elif len(sig.parameters) > 1:
+                return model(input_tensor, batch_data)
+            else:
+                return model(input_tensor)
+        else:
+            filter_batch = True
+            for p in sig.parameters.values():
+                if p.kind == inspect.Parameter.VAR_KEYWORD:
+                    filter_batch = False
+            if filter_batch:
+                batch_data = {k: v for k, v in batch_data.items() if k in sig.parameters}
+            if input_tensor is None:
+                return model(**batch_data)
+            else:
+                return model(input_tensor, **batch_data)
+
+    def forward_step(self, engine, input_tensor, return_tensors, return_output_label=True, accum_loss=None):
        """Forward step for passed-in model. If it is the first stage, the input tensor 
        is obtained from data_iterator, otherwise the passed-in input_tensor is used.
        Returns output tensor. This is a helper function and can be ignored by users.
@ -140,26 +117,19 @@ class PipelineSchedule(BaseSchedule):
        :return: output or the loss value of the current pipeline stage
        :rtype: :class:`torch.Tensor`
        """
-
-        if input_tensor is None:
-            input_tensor, label = self.load_micro_batch()
-        input_tensor = squeeze(input_tensor)
-        output_tensor = engine(input_tensor)
+        data, label = self.load_micro_batch()
+        output_tensor = self._call_engine(engine.model, input_tensor, data)
        output_tensor = squeeze(output_tensor)

        if gpc.is_last_rank(ParallelMode.PIPELINE):
-            if return_loss:
-                input_tensor, label = self.load_micro_batch()
-                loss_reduced = engine.criterion(output_tensor, *label) \
-                    / self.num_microbatches
-
-                return_tensors.append(
-                    tuple((output_tensor, label[0], loss_reduced)))
+            if return_output_label:
+                return_tensors.append(tuple((output_tensor, label)))
+            if accum_loss is not None:
+                loss_reduced = self._call_engine_criterion(engine, output_tensor, label) / self.num_microbatches
+                accum_loss.add_(loss_reduced.detach())
                return loss_reduced
            else:
-                return_tensors.append(output_tensor)
                return output_tensor
-
        else:
            return output_tensor

@ -203,7 +173,8 @@ class PipelineSchedule(BaseSchedule):
                              engine,
                              data_iter,
                              forward_only=False,
-                              return_loss=True):
+                              return_loss=True,
+                              return_output_label=True):
        """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
        Returns a tuple with losses if the last stage, an empty tuple otherwise.

@ -215,6 +186,8 @@ class PipelineSchedule(BaseSchedule):
        :type forward_only: bool
        :param return_loss: whether returns the loss value. Default is true.
        :type return_loss: bool
+        :param return_output_label: If False, the output and label won't be returned
+        :type return_output_label: bool

        :return: (output, label, loss)
        :rtype: Tuple[:class:`torch.Tensor`]
@ -238,11 +211,14 @@ class PipelineSchedule(BaseSchedule):
            input_tensors = []
            output_tensors = []
        return_tensors = []
-
+        if return_loss and gpc.is_pipeline_last_stage(ignore_virtual=True):
+            accum_loss = torch.zeros(1, device=get_current_device())
+        else:
+            accum_loss = None
        # Used for tensor meta information communication
-        ft_shape = None
+        ft_shape = self.tensor_shape
        bt_shape = None
-        fs_checker = True
+        fs_checker = self.tensor_shape is None

        # Run warmup forward passes.
        for i in range(num_warmup_microbatches):
@ -251,7 +227,8 @@ class PipelineSchedule(BaseSchedule):
            input_tensor = recv_forward(ft_shape, dtype=self.dtype)
            output_tensor = self.forward_step(
                engine, input_tensor, return_tensors,
-                return_loss=return_loss
+                return_output_label=return_output_label,
+                accum_loss=accum_loss
            )
            if not gpc.is_last_rank(ParallelMode.PIPELINE):
                bt_shape = output_tensor.shape
@ -276,7 +253,8 @@ class PipelineSchedule(BaseSchedule):

            output_tensor = self.forward_step(
                engine, input_tensor, return_tensors,
-                return_loss=return_loss
+                return_output_label=return_output_label,
+                accum_loss=accum_loss
            )
            if forward_only:
                send_forward(output_tensor)
@ -327,24 +305,37 @@ class PipelineSchedule(BaseSchedule):
                send_backward(input_tensor_grad)

        if len(return_tensors) > 0:
-            if return_loss:
-                output, label, loss = tuple(map(list, zip(*return_tensors)))
+            output, label = tuple(map(list, zip(*return_tensors)))
            return (torch.cat(output, dim=0),
                    torch.cat(label, dim=0),
-                        sum(loss))
+                    accum_loss)
        else:
-                return tuple((torch.cat(return_tensors, dim=0), None, None))
-        else:
-            return tuple((None, None, None))
+            return tuple((None, None, accum_loss))


 class InterleavedPipelineSchedule(PipelineSchedule):
-    def __init__(self, num_microbatches, num_model_chunks, sync_data: bool = True):
+    def __init__(self,
+                 num_microbatches,
+                 num_model_chunks,
+                 batch_data_process_func: Callable = None,
+                 tensor_shape: Union[torch.Size, List[int], Tuple[int]] = None):
+        """A helper schedule class for pipeline parallelism running environment.
+        It uses the interleaved 1F1B strategy. Other properties are similar to
+        :class:`NonPipelineSchedule`.
+
+        :param num_microbatches: The number of microbatches; must be an integer multiple of the pipeline parallel world size
+        :type num_microbatches: int
+        :param num_model_chunks: The number of model chunks; also set as the virtual pipeline parallel size
+        :type num_model_chunks: int
+        :param batch_data_process_func: The preprocessing function which receives a batch of data, and it will be executed in `load_batch`
+        :type batch_data_process_func: Callable
+        :param tensor_shape: Forwarded unchanged to the parent schedule; presumably the shape of the tensors exchanged between pipeline stages (TODO confirm)
+        :type tensor_shape: Union[torch.Size, List[int], Tuple[int]]
+        """
        assert num_microbatches % gpc.get_world_size(ParallelMode.PIPELINE) == 0, \
            'num_microbatches must be an integer multiple of pipeline parallel world size'
-        super().__init__(num_microbatches, sync_data=sync_data)
+        super().__init__(num_microbatches, batch_data_process_func=batch_data_process_func, tensor_shape=tensor_shape)
+        # One virtual pipeline rank per model chunk; the virtual rank starts at 0.
        gpc.set_virtual_pipeline_parallel_size(num_model_chunks)
        gpc.set_virtual_pipeline_parallel_rank(0)
+        self.num_model_chunks = num_model_chunks

    def pre_processing(self, engine):
        if isinstance(engine.optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)):
@ -355,32 +346,46 @@ class InterleavedPipelineSchedule(PipelineSchedule):
        if isinstance(engine.model[0], NaiveAMPModel):
            self.dtype = torch.half

-    def forward_step(self, engine, model, input_tensor, return_tensors, return_loss=True):
+        for model in engine.model:
+            if isinstance(model, NaiveAMPModel):
+                model = model.model
+            sig = inspect.signature(model.forward)
+            for p in sig.parameters.values():
+                assert p.kind != inspect.Parameter.VAR_POSITIONAL, '*args is not supported'
+
+    def load_batch(self, data_iter):
+        """Load a batch through the parent schedule, then reset the per-chunk microbatch offsets.
+
+        :param data_iter: Data iterator handed to the parent ``load_batch``
+        """
+        super().load_batch(data_iter)
+        # Overwrite microbatch_offset with one entry per model chunk: all chunks load the
+        # same microbatches, so each chunk has to track its own offset.
+        self.microbatch_offset = [0] * self.num_model_chunks
+
+    def load_micro_batch(self, model_chunk_id):
+        """Return the next ``(data, label)`` microbatch of one model chunk and advance its offset.
+
+        :param model_chunk_id: Index of the model chunk whose offset in ``self.microbatch_offset`` is used
+        :return: The data slice and the label slice, each passed through ``self._move_to_device``
+        """
+        offset = self.microbatch_offset[model_chunk_id]
+        micro_data = self._get_data_slice(self.batch_data, offset)
+        micro_label = self._get_data_slice(self.batch_label, offset)
+        # Advance only this chunk's offset; the other chunks keep their own positions.
+        self.microbatch_offset[model_chunk_id] = offset + self.microbatch_size
+        return self._move_to_device(micro_data), self._move_to_device(micro_label)
+
+    def forward_step(self, engine, model_chunk_id, input_tensor, return_tensors, return_output_label=True, accum_loss=None):
-        """Forward step for passed-in model. If it is the first stage, the input tensor 
-        is obtained from data_iterator, otherwise the passed-in input_tensor is used.
-        Returns output tensor. This is a helper function and can be ignored by users.
-        """
+        """Forward step for one model chunk. This is a helper function and can be ignored by users.
+
+        The next microbatch of the chunk is loaded with :meth:`load_micro_batch` and passed,
+        together with ``input_tensor``, to ``self._call_engine``; the result is squeezed.
+
+        :param engine: Engine whose ``model`` list holds the model chunks; also passed to ``self._call_engine_criterion``
+        :param model_chunk_id: Index into ``engine.model`` of the chunk to run
+        :param input_tensor: Value handed to ``self._call_engine`` as the chunk input (presumably the activation received from the previous stage -- TODO confirm)
+        :param return_tensors: List that collects ``(output_tensor, label)`` pairs on the last pipeline stage
+        :param return_output_label: If False, nothing is appended to ``return_tensors``
+        :param accum_loss: Tensor the detached loss is added to in place; if None, no loss is computed
+        :return: On the last pipeline stage with ``accum_loss`` given, the loss divided by ``num_microbatches``; otherwise the squeezed output tensor
+        """
-
-        if input_tensor is None:
-            input_tensor, label = self.load_micro_batch()
-        input_tensor = squeeze(input_tensor)
-        output_tensor = model(input_tensor)
+        data, label = self.load_micro_batch(model_chunk_id)
+        output_tensor = self._call_engine(engine.model[model_chunk_id], input_tensor, data)
        output_tensor = squeeze(output_tensor)

        if gpc.is_pipeline_last_stage():
-            if return_loss:
-                input_tensor, label = self.load_micro_batch()
-                loss_reduced = engine.criterion(output_tensor, *label) / self.num_microbatches
-                return_tensors.append(
-                    tuple((output_tensor, label[0], loss_reduced)))
+            if return_output_label:
+                # tuple() accepts a single iterable, so the pair is passed as one tuple argument.
+                return_tensors.append(tuple((output_tensor, label)))
+            if accum_loss is not None:
+                loss_reduced = self._call_engine_criterion(engine, output_tensor, label) / self.num_microbatches
+                accum_loss.add_(loss_reduced.detach())
                return loss_reduced
            else:
-                return_tensors.append(output_tensor)
                return output_tensor
        else:
            return output_tensor

-    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True):
+    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        """Run interleaved 1F1B schedule (model split into model chunks), with
        communication between pipeline stages as needed.

@ -394,11 +399,15 @@ class InterleavedPipelineSchedule(PipelineSchedule):
        return_tensors = []
        if not forward_only:
            output_tensor_grads = [[] for _ in range(len(model))]
+        if return_loss and gpc.is_pipeline_last_stage(ignore_virtual=True):
+            accum_loss = torch.zeros(1, device=get_current_device())
+        else:
+            accum_loss = None

        # Used for tensor meta information communication
-        input_tensor_shapes = [None for _ in range(len(model))]
+        input_tensor_shapes = [self.tensor_shape for _ in range(len(model))]
        output_tensor_shapes = [None for _ in range(len(model))]
-        send_tensor_shape_flags = [True for _ in range(len(model))]
+        send_tensor_shape_flags = [self.tensor_shape is None for _ in range(len(model))]

        pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
        pipeline_parallel_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
@ -450,8 +459,8 @@ class InterleavedPipelineSchedule(PipelineSchedule):
                        len(output_tensors[model_chunk_id]):
                    input_tensors[model_chunk_id].append(None)
            input_tensor = input_tensors[model_chunk_id][-1]
-            output_tensor = self.forward_step(
-                engine, model[model_chunk_id], input_tensor, return_tensors, return_loss=return_loss)
+            output_tensor = self.forward_step(engine, model_chunk_id, input_tensor,
+                                              return_tensors, return_output_label=return_output_label, accum_loss=accum_loss)
            output_tensors[model_chunk_id].append(output_tensor)

            # if forward-only, no need to save tensors for a backward pass
@ -633,12 +642,9 @@ class InterleavedPipelineSchedule(PipelineSchedule):
                        dtype=self.dtype))

        if len(return_tensors) > 0:
-            if return_loss:
-                output, label, loss = tuple(map(list, zip(*return_tensors)))
+            output, label = tuple(map(list, zip(*return_tensors)))
            return (torch.cat(output, dim=0),
                    torch.cat(label, dim=0),
-                        sum(loss))
+                    accum_loss)
        else:
-                return tuple((torch.cat(return_tensors, dim=0), None, None))
-        else:
-            return tuple((None, None, None))
+            return tuple((None, None, accum_loss))
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@ -338,6 +338,19 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
                    "Data parallel training is detected when using pipeline parallel, DataParallelGradientHandler is automatically "
                    "added even though not specified in the configuration",
                    ranks=[0])
+        # add pipeline parallel gradient handler, if pipeline shared module is detected
+        for param in model.parameters():
+            if getattr(param, 'pipeline_shared_module_pg', None) is not None:
+                if gradient_handler_cfg is None:
+                    gradient_handler_cfg = [dict(type='PipelineSharedModuleGradientHandler')]
+                else:
+                    gradient_handler_cfg.append(dict(type='PipelineSharedModuleGradientHandler'))
+                if verbose:
+                    logger.info(
+                        "pipeline_shared_module is detected, PipelineSharedModuleGradientHandler is automatically "
+                        "added even though not specified in the configuration",
+                        ranks=[0])
+                break
    else:
        if not isinstance(gradient_handler_cfg, list):
            raise ConfigException(
--- a/colossalai/nn/layer/colossalai_layer/_utils.py
+++ b/colossalai/nn/layer/colossalai_layer/_utils.py
@ -11,8 +11,8 @@ _parallel_split_batch = {'2d': split_tensor_2d, '2.5d': split_tensor_2p5d, '3d':
 def split_batch(input_) -> Tensor:
    tensor_parallel_mode = get_tensor_parallel_mode()
    if tensor_parallel_mode in _parallel_split_batch:
-        if isinstance(input_, (tuple, list)):
-            return tuple(map(_parallel_split_batch[tensor_parallel_mode], input_))
+        if isinstance(input_, dict):
+            return {k: _parallel_split_batch[tensor_parallel_mode](v) for k, v in input_.items()}
        else:
            return _parallel_split_batch[tensor_parallel_mode](input_)
    else:
--- a/colossalai/nn/layer/wrapper/init.py
+++ b/colossalai/nn/layer/wrapper/init.py
@ -1,3 +1,4 @@
 from .lambda_wrapper import LambdaWrapper
+from .pipeline_wrapper import PipelineSharedModuleWrapper

-__all__ = ['LambdaWrapper']
+__all__ = ['LambdaWrapper', 'PipelineSharedModuleWrapper']
--- a/colossalai/nn/layer/wrapper/pipeline_wrapper.py
+++ b/colossalai/nn/layer/wrapper/pipeline_wrapper.py
@ -0,0 +1,40 @@
+import torch.nn as nn
+import torch.distributed as dist
+from typing import List, Tuple, Union
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+
+
+class PipelineSharedModuleWrapper:
+    """Helper that builds a process group over a subset of pipeline stages and
+    registers modules whose parameters are shared between those stages.
+
+    :param pipeline_ranks: Positions within the pipeline of the stages that share the module; at least two are required
+    :type pipeline_ranks: Union[List[int], Tuple[int]]
+    """
+
+    def __init__(self, pipeline_ranks: Union[List[int], Tuple[int]]) -> None:
+        assert len(pipeline_ranks) > 1, f'Expect len(pipeline_ranks) > 1, got {len(pipeline_ranks)}'
+        self.pipeline_ranks = pipeline_ranks
+        # group / ranks_in_group stay None when this process is in none of the created groups.
+        self.group = None
+        self.ranks_in_group = None
+        self._init_group()
+
+    def _init_group(self):
+        """Create the process groups of the selected stages and keep the one that contains this rank."""
+        world_size = gpc.get_world_size(ParallelMode.GLOBAL)
+        dp_size = gpc.get_world_size(ParallelMode.DATA)
+        pp_size = gpc.get_world_size(ParallelMode.PIPELINE)
+        rank = gpc.get_global_rank()
+        # NOTE(review): despite the names, num_dp_groups is world_size // dp_size and
+        # num_pp_stages is that value divided by pp_size; the latter is used below as the
+        # stride between the global ranks of consecutive stages -- presumably the tensor
+        # parallel size, confirm against the rank layout set up by gpc.
+        num_dp_groups = world_size // dp_size
+        num_pp_stages = num_dp_groups // pp_size
+        # Every process runs the full loop: torch.distributed.new_group has to be entered by
+        # all processes, including those that will not be members of the group.
+        for i in range(dp_size):
+            for j in range(num_pp_stages):
+                pipeline_ranks = list(
+                    range(i * num_dp_groups + j,
+                          (i + 1) * num_dp_groups,
+                          num_pp_stages))
+                # Pick the global ranks of the selected stages, in the order given by the user.
+                sub_ranks = [pipeline_ranks[idx] for idx in self.pipeline_ranks]
+                group = dist.new_group(sub_ranks)
+                if rank in sub_ranks:
+                    self.group = group
+                    self.ranks_in_group = sub_ranks
+
+    def register_module(self, module: nn.Module):
+        """Mark the parameters of ``module`` as shared and broadcast them within the group.
+
+        Each parameter gets the attribute ``pipeline_shared_module_pg`` set to the process
+        group and is then broadcast from the first selected stage to the other group members.
+
+        :param module: Module whose parameters are shared between the selected stages
+        :type module: nn.Module
+        :raises AssertionError: If this process belongs to none of the selected stages
+        """
+        assert self.ranks_in_group is not None, f'Rank {gpc.get_local_rank(ParallelMode.PIPELINE)} is not in pipeline_ranks {self.pipeline_ranks}'
+        # ranks_in_group has one entry per selected stage, ordered like pipeline_ranks, so the
+        # global rank of the first selected stage is element 0. Indexing it with the stage
+        # index itself is only correct when that stage is stage 0 and can run out of range.
+        src = self.ranks_in_group[0]
+        for p in module.parameters():
+            setattr(p, 'pipeline_shared_module_pg', self.group)
+            dist.broadcast(p, src, group=self.group)
--- a/colossalai/trainer/_trainer.py
+++ b/colossalai/trainer/_trainer.py
@ -155,7 +155,8 @@ class Trainer:
    def _train_epoch(self,
                     train_dataloader: DataLoader,
                     epoch: int = None,
-                     display_progress: bool = False):
+                     display_progress: bool = False,
+                     return_output_label: bool = True):
        # set training state
        self._engine.train()
        data_iter = iter(train_dataloader)
@ -175,7 +176,7 @@ class Trainer:
            # run 1 training step
            self.engine.zero_grad()
            logits, label, loss = self.schedule.forward_backward_step(
-                self.engine, data_iter, forward_only=False, return_loss=True)
+                self.engine, data_iter, forward_only=False, return_loss=True, return_output_label=return_output_label)
            self.engine.step()
            self._call_timer(action='stop', item='Train-step', keep_in_history=True)
            self._call_hooks('after_train_iter', output=(logits, label, loss))
@ -197,7 +198,8 @@ class Trainer:
    def _eval(self,
              test_dataloader: DataLoader,
              epoch: int = None,
-              display_progress: bool = False):
+              display_progress: bool = False,
+              return_output_label: bool = True):
        # switch engine status
        self._engine.eval()

@ -220,7 +222,7 @@ class Trainer:
                self._call_hooks('before_test_iter')
                self._call_timer(action='start', item='Test-step')
                logits, label, loss = self.schedule.forward_backward_step(
-                    self.engine, data_iter, forward_only=True, return_loss=True)
+                    self.engine, data_iter, forward_only=True, return_loss=True, return_output_label=return_output_label)
                self._call_timer(action='stop', item='Test-step', keep_in_history=True)
                self._call_hooks('after_test_iter',
                                 output=(logits, label, loss))
@ -246,6 +248,7 @@ class Trainer:
            test_interval: int = 1,
            hooks: List[BaseHook] = None,
            display_progress: bool = False,
+            return_output_label: bool = True,
            ):
        """Trains the model to fit training data.

@ -256,6 +259,8 @@ class Trainer:
        :param test_interval: Interval of testing
        :param hooks_cfg: A list of hook configuration
        :param display_progress: If True, the training progress will be printed
+        :param return_output_label: If True, the output of model and the label will be returned
+        :type return_output_label: bool
        :type train_dataloader: DataLoader
        :type epochs: int
        :type max_steps: int
@ -307,7 +312,8 @@ class Trainer:
            self._train_epoch(
                train_dataloader=train_dataloader,
                epoch=epoch,
-                display_progress=display_progress
+                display_progress=display_progress,
+                return_output_label=return_output_label
            )

            # start eval
@ -315,6 +321,7 @@ class Trainer:
                self._eval(test_dataloader=test_dataloader,
                           display_progress=display_progress,
                           epoch=epoch,
+                           return_output_label=return_output_label
                           )

            self._cur_epoch += 1
@ -331,13 +338,16 @@ class Trainer:
    def evaluate(self,
                 test_dataloader: DataLoader,
                 hooks: List[BaseHook] = None,
-                 display_progress: bool = False):
+                 display_progress: bool = False,
+                 return_output_label: bool = True):
        """Evaluates the model with testing data.

        :param test_dataloader: DataLoader in testing
        :param display_progress: If True, the evaluation progress will be printed
+        :param return_output_label: If True, the output of model and the label will be returned
        :type test_dataloader: DataLoader
        :type display_progress: bool, optional
+        :type return_output_label: bool
        """
        # set display
        display_progress = self._should_display_progress(display_progress)
@ -360,6 +370,7 @@ class Trainer:
        # eval
        self._eval(test_dataloader=test_dataloader,
                   display_progress=display_progress,
+                   return_output_label=return_output_label
                   )

    def predict(self, data: Union[Tensor, List[Tensor]]):
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@ -155,22 +155,12 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    if norm_type == inf:
        total_norm = max(p.grad.data.abs().max() for p in params)
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
-        ops = []
        # Take max across all model-parallel GPUs.
-        if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1:
-            ops.append(dist.all_reduce(total_norm_cuda,
+        if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
+            dist.all_reduce(total_norm_cuda,
                            op=dist.ReduceOp.MAX,
-                                       group=gpc.get_group(
-                                           ParallelMode.TENSOR),
-                                       async_op=True))
-        if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1:
-            ops.append(dist.all_reduce(total_norm_cuda,
-                                       op=dist.ReduceOp.MAX,
-                                       group=gpc.get_group(
-                                           ParallelMode.PIPELINE),
-                                       async_op=True))
-        for req in ops:
-            req.wait()
+                            group=gpc.get_group(ParallelMode.MODEL),
+                            async_op=False)
        total_norm = total_norm_cuda[0].item()
    else:
        tensor_parallel_grads = []
--- a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py
@ -65,6 +65,7 @@ class GradAccumOptimizer(ColossalaiOptimizer):
            self.optim.backward(scaled_loss)

    def backward_by_grad(self, tensor: Tensor, grad: Tensor):
+        self.accumulate_step += 1
        no_sync = self.is_torch_ddp and self.accumulate_step < self.accumulate_size

        if no_sync:
--- a/docs/add_your_parallel.md
+++ b/docs/add_your_parallel.md
@ -26,8 +26,6 @@ follow the steps below to create a new distributed initialization.
        GLOBAL = 'global'
        DATA = 'data'
        PIPELINE = 'pipe'
-        PIPELINE_PREV = 'pipe_prev'
-        PIPELINE_NEXT = 'pipe_next'
        ...

        NEW_MODE = 'new_mode'  # define your mode here
--- a/docs/add_your_parallel_zh.md
+++ b/docs/add_your_parallel_zh.md
@ -18,8 +18,6 @@ class ParallelMode(Enum):
    GLOBAL = 'global'
    DATA = 'data'
    PIPELINE = 'pipe'
-    PIPELINE_PREV = 'pipe_prev'
-    PIPELINE_NEXT = 'pipe_next'
    ...

    NEW_MODE = 'new_mode'  # define your mode here
--- a/tests/test_context/test_2d_init.py
+++ b/tests/test_context/test_2d_init.py
@ -33,6 +33,12 @@ def check_pipeline_parallel_rank(rank):
        assert gpc.get_local_rank(ParallelMode.PIPELINE) == 1


+def check_model_parallel_rank(rank):
+    """Assert that global ranks 0-15 have the local MODEL rank ``rank % 8``; other ranks are not checked."""
+    if rank in range(16):
+        assert gpc.get_local_rank(ParallelMode.MODEL) == rank % 8
+
+
 def check_tensor_parallel_rank(rank):
    if rank in [0, 4, 8, 12]:
        assert gpc.get_local_rank(ParallelMode.TENSOR) == 0
@ -75,6 +81,7 @@ def init_2d(rank, world_size, backend, port, host):
    check_data_parallel_rank(rank)
    check_2d_parallel_rank(rank)
    check_pipeline_parallel_rank(rank)
+    check_model_parallel_rank(rank)
    gpc.destroy()
    torch.cuda.empty_cache()

--- a/tests/test_context/test_2p5d_init.py
+++ b/tests/test_context/test_2p5d_init.py
@ -37,6 +37,12 @@ def check_pipeline_parallel_rank(rank):
        assert ppr == 1


+def check_model_parallel_rank(rank):
+    """Assert that global ranks 0-31 have the local MODEL rank ``rank % 16``; other ranks are not checked."""
+    if rank in range(32):
+        assert gpc.get_local_rank(ParallelMode.MODEL) == rank % 16
+
+
 def check_tensor_parallel_rank(rank):
    tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)

@ -98,6 +104,7 @@ def init_2halfd(rank, world_size, backend, port, host):
    check_pipeline_parallel_rank(rank)
    check_tensor_parallel_rank(rank)
    check_2p5d_parallel_rank(rank)
+    check_model_parallel_rank(rank)
    gpc.destroy()
    torch.cuda.empty_cache()

--- a/tests/test_context/test_3d_init.py
+++ b/tests/test_context/test_3d_init.py
@ -37,6 +37,12 @@ def check_pipeline_parallel_rank(rank):
        assert ppr == 1


+def check_model_parallel_rank(rank):
+    """Assert that global ranks 0-31 have the local MODEL rank ``rank % 16``; other ranks are not checked."""
+    if rank in range(32):
+        assert gpc.get_local_rank(ParallelMode.MODEL) == rank % 16
+
+
 def check_tensor_parallel_rank(rank):
    tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)

@ -90,6 +96,7 @@ def init_3d(rank, world_size, backend, port, host):
    check_3d_parallel_rank(rank)
    check_data_parallel_rank(rank)
    check_pipeline_parallel_rank(rank)
+    check_model_parallel_rank(rank)
    gpc.destroy()
    torch.cuda.empty_cache()

--- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
+++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
@ -23,7 +23,7 @@ BATCH_SIZE = 16
 NUM_EPOCHS = 60
 WARMUP_EPOCHS = 5
 CONFIG = dict(parallel=dict(pipeline=2, tensor=dict(size=2, mode='1d')),
-              fp16=dict(mode=AMP_TYPE.TORCH),
+              fp16=dict(mode=AMP_TYPE.NAIVE),
              gradient_accumulation=2)


--- a/tests/test_trainer/test_pipeline/test_p2p.py
+++ b/tests/test_trainer/test_pipeline/test_p2p.py
@ -75,40 +75,7 @@ def check_forward_backward(output_tensor, output_grad, rank, logger):
                rank, check_equal(grad, output_grad)))


-def check_op(size, rank, prev_rank, next_rank, up_group, down_group, logger):
-    dtype = torch.float32
-    device = get_current_device()
-    tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
-    # recv_tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
-    grad_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
-    tensor = torch.randn(tensor_shape, dtype=dtype, device=device)
-    dist.all_reduce(tensor)
-    grad = torch.randn(grad_shape, dtype=dtype, device=device)
-    dist.all_reduce(grad)
-    if rank % 2 == 0:
-        need_meta = True
-        need_meta = send_tensor_meta(tensor, need_meta)
-        logger.info('Rank {} shape sent (need meta: {}).'.format(
-            rank, need_meta))
-        req = dist.broadcast(tensor, src=rank, group=down_group, async_op=True)
-        req.wait()
-        out = tensor.clone()
-        logger.info('Rank {} test op: tensor sent.'.format(rank))
-    else:
-        recv_tensor_shape = recv_tensor_meta(None)
-        logger.info('Rank {} shape received. Correct shape: {}'.format(
-            rank, tensor_shape == recv_tensor_shape))
-        out = torch.empty(recv_tensor_shape, dtype=dtype, device=device)
-        req = dist.broadcast(out, src=prev_rank, group=up_group, async_op=True)
-        req.wait()
-        logger.info('Rank {} test op: received tensor ({})'.format(
-            rank, out.shape))
-
-    logger.info('Rank {} test op. Correct tensor: {}'.format(
-        rank, check_equal(tensor, out)))
-
-
-def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
+def check_comm(size, rank, prev_rank, next_rank,  logger):
    dtype = torch.float32
    device = get_current_device()
    tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
@ -117,7 +84,6 @@ def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger):
    dist.all_reduce(tensor)
    grad = torch.randn(grad_shape, dtype=dtype, device=device)
    dist.all_reduce(grad)
-    check_op(size, rank, prev_rank, next_rank, up_group, down_group, logger)
    check_forward(tensor, rank, logger)
    check_backward(grad, rank, logger)
    check_forward_backward(tensor, grad, rank, logger)
@ -135,18 +101,13 @@ def run_check(rank, world_size, port):
    logger = get_dist_logger()
    rank = gpc.get_global_rank()
    prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
-    up_ranks = gpc.get_ranks_in_group(ParallelMode.PIPELINE_PREV)
-    up_group = gpc.get_group(ParallelMode.PIPELINE_PREV)
    next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
-    down_ranks = gpc.get_ranks_in_group(ParallelMode.PIPELINE_NEXT)
-    down_group = gpc.get_group(ParallelMode.PIPELINE_NEXT)
    logger.info(
-        'Rank {0}: prev rank {1} (up: {2}), next rank {3} (down: {4})'.format(
-            rank, prev_rank, up_ranks, next_rank, down_ranks))
+        'Rank {0}: prev rank {1}, next rank {2}'.format(
+            rank, prev_rank, next_rank))
    logger.info('Distributed environment is initialzied.')

-    check_comm(world_size, rank, prev_rank, next_rank, up_group, down_group,
-               logger)
+    check_comm(world_size, rank, prev_rank, next_rank, logger)
    gpc.destroy()
    torch.cuda.empty_cache()