From 7c70bfbefa8484011eeb704cea1470449873e5e3 Mon Sep 17 00:00:00 2001
From: ver217
Date: Thu, 14 Jul 2022 17:31:13 +0800
Subject: [PATCH] [hotfix] fix PipelineSharedModuleGradientHandler (#1314)

---
 .../_pipeline_parallel_gradient_handler.py    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
index 31a3dccee..83f5c00cf 100644
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
@@ -33,14 +33,19 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
             # Pack the buckets.
             for param in self._model.parameters():
                 group = getattr(param, 'pipeline_shared_module_pg', None)
-                if param.requires_grad and param.grad is not None and group is not None:
+                if param.requires_grad and group is not None and (
+                        (hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
+                        or param.grad is not None):
                     tp = param.data.type()
                     buckets[group][tp].append(param)
 
             # For each bucket, all-reduce and copy all-reduced grads.
             for group, group_buckets in buckets.items():
                 for tp, bucket in group_buckets.items():
-                    grads = [param.grad.data for param in bucket]
+                    grads = [
+                        param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
+                        for param in bucket
+                    ]
                     coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
                     dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
                     for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
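
Reviewer note (not part of the patch): the hunk's apparent intent is that parameters managed by ColossalAI's sharded-tensor machinery keep their gradient in `param.colo_attr` rather than in `param.grad`, so the old `param.grad is not None` test silently dropped them from the shared-module all-reduce. Below is a minimal sketch of the gradient-selection and bucket all-reduce pattern the patched code follows. It assumes `colo_attr.saved_grad.is_null()` and `colo_attr.grad_payload` behave exactly as the diff uses them; the helper names `_holds_grad`, `_select_grad`, and `allreduce_bucket` are illustrative only and do not exist in the codebase. The bucket function only runs inside an initialized torch.distributed job with CUDA available.

    # Sketch only: mirrors the patched gradient selection and bucket all-reduce.
    # `colo_attr.saved_grad` / `colo_attr.grad_payload` are assumed to behave as
    # used in the diff above; the helper names here are hypothetical.
    import torch
    import torch.distributed as dist
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    def _holds_grad(param: torch.nn.Parameter) -> bool:
        # Mirrors the patched condition: a sharded parameter may have
        # param.grad == None while its gradient lives in colo_attr.saved_grad.
        return ((hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
                or param.grad is not None)

    def _select_grad(param: torch.nn.Parameter) -> torch.Tensor:
        # Prefer the sharded gradient payload; fall back to the dense .grad tensor,
        # exactly as the patched list comprehension does.
        return param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data

    def allreduce_bucket(bucket, group=None):
        # Flatten the bucket, all-reduce once, then copy the synced slices back.
        grads = [_select_grad(p) for p in bucket if _holds_grad(p)]
        coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
        dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)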