From 4ac8bfb07285a417dba3d302e477d6e57b0b6d5f Mon Sep 17 00:00:00 2001
From: CZYCW
Date: Wed, 15 Feb 2023 09:40:08 +0800
Subject: [PATCH] [NFC] polish colossalai/engine/gradient_handler/utils.py code style (#2708)

---
 colossalai/engine/gradient_handler/utils.py | 59 +++++++++++----------
 1 file changed, 30 insertions(+), 29 deletions(-)

diff --git a/colossalai/engine/gradient_handler/utils.py b/colossalai/engine/gradient_handler/utils.py
index e92044b47..fca5f2ec9 100644
--- a/colossalai/engine/gradient_handler/utils.py
+++ b/colossalai/engine/gradient_handler/utils.py
@@ -1,29 +1,30 @@
-import torch.distributed as dist
-import torch.nn as nn
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from typing import Iterable
-
-
-def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None):
-    # get communication world size
-    comm_size = dist.get_world_size(group)
-    # bucketize and all-reduce
-    buckets = {}
-    # Pack the buckets.
-    for param in param_list:
-        if param.requires_grad and param.grad is not None:
-            tp = param.data.type()
-            if tp not in buckets:
-                buckets[tp] = []
-            buckets[tp].append(param)
-
-    # For each bucket, all-reduce and copy all-reduced grads.
-    for tp in buckets:
-        bucket = buckets[tp]
-        grads = [param.grad.data for param in bucket]
-        coalesced = _flatten_dense_tensors(grads)
-        coalesced /= comm_size
-
-        dist.all_reduce(coalesced, group=group)
-        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-            buf.copy_(synced)
+from typing import Iterable
+
+import torch.distributed as dist
+import torch.nn as nn
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+
+def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None):
+    # get communication world size
+    comm_size = dist.get_world_size(group)
+    # bucketize and all-reduce
+    buckets = {}
+    # Pack the buckets.
+    for param in param_list:
+        if param.requires_grad and param.grad is not None:
+            tp = param.data.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(param)
+
+    # For each bucket, all-reduce and copy all-reduced grads.
+    for tp in buckets:
+        bucket = buckets[tp]
+        grads = [param.grad.data for param in bucket]
+        coalesced = _flatten_dense_tensors(grads)
+        coalesced /= comm_size
+
+        dist.all_reduce(coalesced, group=group)
+        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+            buf.copy_(synced)
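
Reviewer note (not part of the patch): a minimal usage sketch of the function this patch touches. It assumes torch.distributed has already been initialized (e.g. launched with torchrun) and that every rank holds an identical model replica; the helper name sync_gradients is hypothetical and only illustrates where bucket_allreduce would typically be called after a backward pass.

import torch
import torch.distributed as dist
import torch.nn as nn

from colossalai.engine.gradient_handler.utils import bucket_allreduce


def sync_gradients(model: nn.Module, group=None):
    # Hypothetical helper: average the gradients of all parameters across
    # the (data-parallel) process group using the bucketed all-reduce
    # from the patched module.
    bucket_allreduce(param_list=model.parameters(), group=group)


if __name__ == "__main__":
    # Assumes RANK / WORLD_SIZE / MASTER_ADDR etc. are set by the launcher,
    # e.g. `torchrun --nproc_per_node=2 example.py`.
    dist.init_process_group(backend="gloo")  # or "nccl" on GPU nodes
    model = nn.Linear(16, 4)
    model(torch.randn(8, 16)).sum().backward()
    sync_gradients(model)  # gradients are now averaged on every rank
    dist.destroy_process_group()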