ColossalAI/colossalai/nn/parallel/layers/cache_embedding/copyer.py

import torch
from torch import LongTensor


class LimitBuffIndexCopyer(object):
    """LimitBuffIndexCopyer

    Index copy using a limited temporary buffer on CUDA.

    Args:
        size (int): buffer size
    """

    def __init__(self, size: int) -> None:
        self._buff_size = size
    @torch.no_grad()
    def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
        """Copy rows from src to tgt through a bounded temporary buffer:
        src[src_index] -(index_select)-> tmp -(index_copy_)-> tgt[tgt_index]

        The valid rows in the src tensor are contiguous, while the rows in the tgt tensor are scattered.

        Args:
            dim (int): dimension along which to index
            src_index (LongTensor): indices of the src tensor to select from
            tgt_index (LongTensor): indices of the tgt tensor to copy into
            src (torch.Tensor): the tensor containing values to copy
            tgt (torch.Tensor): the tensor to be copied into
        """
        # tgt.index_copy_(dim, index, src)
        assert dim == 0, "only support index_copy on dim 0"
        assert tgt.dim() == 2
        assert src.dim() == 2

        tgt_device = tgt.device
        src_device = src.device

        assert src_index.numel() == tgt_index.numel()
        dim_size = src_index.numel()
        src_index = src_index.to(src_device)
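        # Process the indices in chunks of at most `_buff_size` rows, so the
        # temporary buffer never exceeds the configured limit.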
        for begin_pos in range(0, dim_size, self._buff_size):
            cur_len = min(self._buff_size, dim_size - begin_pos)
            src_idx_piece = src_index.narrow(0, begin_pos, cur_len)
            if src_device.type == 'cpu' and tgt_device.type == 'cuda':
                cpu_tmp_buffer = src.index_select(dim, src_idx_piece).pin_memory()
                tmp_buffer = torch.empty_like(cpu_tmp_buffer, device=tgt_device)
                tmp_buffer.copy_(cpu_tmp_buffer)
            else:
                tmp_buffer = src.index_select(dim, src_idx_piece).to(tgt_device)
            tgt_idx_piece = tgt_index.narrow(0, begin_pos, cur_len)
            tgt.index_copy_(dim, tgt_idx_piece, tmp_buffer)
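

# A minimal usage sketch (illustration only, not part of the original file): it shows
# the copier scattering rows from a contiguous CPU source table into selected rows of
# a (possibly CUDA) target table while staging at most `size` rows per chunk. The
# shapes, buffer size, and index values below are made-up assumptions.
if __name__ == "__main__":
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    src = torch.randn(8, 4)                    # contiguous source rows, kept on CPU
    tgt = torch.zeros(16, 4, device=device)    # destination rows, scattered
    copyer = LimitBuffIndexCopyer(size=3)      # stage at most 3 rows per chunk

    src_index = torch.arange(8)                # select all 8 source rows in order
    # index_copy_ expects the index on the same device as tgt
    tgt_index = torch.tensor([0, 2, 4, 6, 8, 10, 12, 14], device=device)

    copyer.index_copy(0, src_index, tgt_index, src, tgt)
    assert torch.allclose(tgt[tgt_index].cpu(), src)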