2022-03-18 08:18:31 +00:00
|
|
|
from typing import List, Optional
|
2022-03-14 06:48:32 +00:00
|
|
|
|
|
|
|
import torch
|
|
|
|
import torch.distributed as dist
|
|
|
|
from colossalai.utils import get_current_device
|
|
|
|
from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
|
2022-03-18 08:18:31 +00:00
|
|
|
from torch._utils import _flatten_dense_tensors as flatten
|
2022-03-14 06:48:32 +00:00
|
|
|
|
|
|
|
from .tensor_shard_strategy import TensorShardStrategy
|
|
|
|
|
|
|
|
|
|
|
|
class BucketTensorShardStrategy(TensorShardStrategy):
|
2022-03-18 08:48:20 +00:00
|
|
|
"""Use the same shard scheme as `TensorShardStrategy`'s, but it gathers tensors of a sub-module together,
|
|
|
|
which will fully utilize network bandwidth.
|
|
|
|
It is especially useful when sub-module contains bias,
|
|
|
|
since we cannot utilize network bandwidth well if we only gather a bias tensor (bias is usaully small).
|
|
|
|
"""
|
2022-03-14 06:48:32 +00:00
|
|
|
|
2022-03-18 08:18:31 +00:00
|
|
|
def gather(self, tensor_list: List[ShardedTensor], process_group: Optional[dist.ProcessGroup] = None):
|
2022-03-25 10:03:32 +00:00
|
|
|
|
2022-03-14 06:48:32 +00:00
|
|
|
tensor_list: List[ShardedTensor] = [t for t in tensor_list if t.is_sharded]
|
|
|
|
if len(tensor_list) == 0:
|
|
|
|
return
|
|
|
|
target_device = tensor_list[0].device
|
|
|
|
dtype = tensor_list[0].dtype
|
|
|
|
buffer_list: List[torch.Tensor] = []
|
|
|
|
tensor_numels = [t.payload.numel() for t in tensor_list]
|
|
|
|
buffer_size = sum(tensor_numels)
|
2022-03-18 08:18:31 +00:00
|
|
|
world_size = dist.get_world_size(process_group)
|
|
|
|
rank = dist.get_rank(process_group)
|
|
|
|
for i in range(world_size):
|
|
|
|
if i == rank:
|
2022-03-14 06:48:32 +00:00
|
|
|
buffer_list.append(flatten([t.payload for t in tensor_list]).cuda(get_current_device()))
|
2022-03-14 07:48:55 +00:00
|
|
|
# Release payload here, to decrease peak memory usage
|
|
|
|
for t in tensor_list:
|
|
|
|
t.reset_payload(None)
|
2022-03-14 06:48:32 +00:00
|
|
|
else:
|
|
|
|
buffer_list.append(torch.zeros(buffer_size, dtype=dtype, device=get_current_device()))
|
2022-03-18 08:18:31 +00:00
|
|
|
dist.all_gather(buffer_list, buffer_list[rank], group=process_group)
|
2022-03-14 06:48:32 +00:00
|
|
|
# Move to target device before splitting buffer
|
|
|
|
# Ensure we utilize maximum PCIE bandwidth
|
|
|
|
buffer_list = [buffer.to(target_device) for buffer in buffer_list]
|
|
|
|
offset = 0
|
|
|
|
for i, t in enumerate(tensor_list):
|
|
|
|
gathered_payload = [buffer[offset:offset + tensor_numels[i]] for buffer in buffer_list]
|
|
|
|
gathered_payload = torch.cat(gathered_payload)[:t.origin_numel].view(t.origin_shape)
|
|
|
|
t.reset_payload(gathered_payload)
|
|
|
|
t.is_sharded = False
|
|
|
|
offset += tensor_numels[i]
|