ColossalAI/colossalai/zero/shard_utils/bucket_tensor_shard_strateg...

from typing import List

import torch
import torch.distributed as dist
from colossalai.utils import get_current_device
from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor
from torch._utils import _flatten_dense_tensors as flatten

from .tensor_shard_strategy import TensorShardStrategy


class BucketTensorShardStrategy(TensorShardStrategy):
    """Shard strategy whose ``gather`` handles a whole list of tensors at once.

    Instead of issuing one collective per tensor, the local shards of every
    sharded tensor in the list are flattened into a single bucket and
    exchanged with one ``all_gather`` call.
    """

    def gather(self, tensor_list: List[ShardedTensor]):
        """Rebuild the full payload of every sharded tensor in ``tensor_list``.

        Tensors whose ``is_sharded`` flag is falsy are skipped. For the rest,
        the local payloads are flattened into one buffer, all-gathered over
        ``self.process_group``, and each tensor's payload is then replaced by
        the concatenation of its slices from every rank, truncated to
        ``origin_numel`` and viewed as ``origin_shape``. The tensor is marked
        as no longer sharded afterwards.

        Args:
            tensor_list: Tensors to gather; modified in place.

        Returns:
            None. Returns early when no tensor in the list is sharded.
        """
        shards: List[ShardedTensor] = [t for t in tensor_list if t.is_sharded]
        if not shards:
            return

        # Device and dtype are taken from the first tensor only.
        # NOTE(review): presumably all tensors in the bucket share them -- confirm.
        first = shards[0]
        target_device = first.device
        dtype = first.dtype

        shard_numels = [t.payload.numel() for t in shards]
        bucket_numel = sum(shard_numels)

        # One buffer per rank: this rank's slot holds the flattened local
        # payloads, every other slot is a placeholder filled by all_gather.
        buckets: List[torch.Tensor] = [
            flatten([t.payload for t in shards]).cuda(get_current_device())
            if rank == self.local_rank
            else torch.zeros(bucket_numel, dtype=dtype, device=get_current_device())
            for rank in range(self.world_size)
        ]
        dist.all_gather(buckets, buckets[self.local_rank], group=self.process_group)

        # Move to target device before splitting buffer
        # Ensure we utilize maximum PCIE bandwidth
        buckets = [bucket.to(target_device) for bucket in buckets]

        start = 0
        for tensor, numel in zip(shards, shard_numels):
            end = start + numel
            # The same [start, end) window in every rank's bucket belongs to this tensor.
            pieces = [bucket[start:end] for bucket in buckets]
            merged = torch.cat(pieces)
            # Keep only the first origin_numel elements before restoring the shape
            # (presumably the remainder is shard padding -- TODO confirm).
            full_payload = merged[:tensor.origin_numel].view(tensor.origin_shape)
            tensor.reset_payload(full_payload)
            tensor.is_sharded = False
            start = end
add bucket tensor shard strategy 2022-03-14 06:48:32 +00:00			`from typing import List`

			`import torch`
			`import torch.distributed as dist`
			`from colossalai.utils import get_current_device`
			`from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor`
			`from torch._utils import _flatten_dense_tensors as flatten`

			`from .tensor_shard_strategy import TensorShardStrategy`


			`class BucketTensorShardStrategy(TensorShardStrategy):`

			`def gather(self, tensor_list: List[ShardedTensor]):`
			`tensor_list: List[ShardedTensor] = [t for t in tensor_list if t.is_sharded]`
			`if len(tensor_list) == 0:`
			`return`
			`target_device = tensor_list[0].device`
			`dtype = tensor_list[0].dtype`
			`buffer_list: List[torch.Tensor] = []`
			`tensor_numels = [t.payload.numel() for t in tensor_list]`
			`buffer_size = sum(tensor_numels)`
			`for i in range(self.world_size):`
			`if i == self.local_rank:`
			`buffer_list.append(flatten([t.payload for t in tensor_list]).cuda(get_current_device()))`
			`else:`
			`buffer_list.append(torch.zeros(buffer_size, dtype=dtype, device=get_current_device()))`
			`dist.all_gather(buffer_list, buffer_list[self.local_rank], group=self.process_group)`
			`# Move to target device before splitting buffer`
			`# Ensure we utilize maximum PCIE bandwidth`
			`buffer_list = [buffer.to(target_device) for buffer in buffer_list]`
			`offset = 0`
			`for i, t in enumerate(tensor_list):`
			`gathered_payload = [buffer[offset:offset + tensor_numels[i]] for buffer in buffer_list]`
			`gathered_payload = torch.cat(gathered_payload)[:t.origin_numel].view(t.origin_shape)`
			`t.reset_payload(gathered_payload)`
			`t.is_sharded = False`
			`offset += tensor_numels[i]`