ColossalAI/colossalai/nn/parallel/layers/cache_embedding/freq_aware_embedding.py

import torch
import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple

from .base_embedding import BaseEmbeddingBag
from .cache_mgr import CachedParamMgr
from torch.nn.parameter import Parameter


class FreqAwareEmbeddingBag(BaseEmbeddingBag):

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        max_norm=None,
        norm_type=2.,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        mode='mean',
        include_last_offset=False,
        dtype=None,
        device=None,
        cuda_row_num=0,
        ids_freq_mapping=None,
        warmup_ratio=0.7,
        buffer_size=50_000,
        pin_weight=False,
    ):
        super(FreqAwareEmbeddingBag, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type,
                                                    scale_grad_by_freq, sparse, mode, include_last_offset)

        if _weight is None:
            _weight = self._weight_alloc(dtype, device)

        # configure weight & cache
        self._preprocess(_weight, cuda_row_num, ids_freq_mapping, warmup_ratio, buffer_size, pin_weight)

    def _weight_alloc(self, dtype, device):
        weight = torch.empty(self.num_embeddings, self.embedding_dim, dtype=dtype, device=device)
        with torch.no_grad():
            weight.data.uniform_(-1 / self.num_embeddings, 1 / self.num_embeddings)
            if self.padding_idx is not None:
                weight[self.padding_idx].fill_(0)
        return weight

    def _preprocess(self,
                    weight,
                    cuda_row_num: int,
                    ids_freq_mapping: Optional[List[int]] = None,
                    warmup_ratio=0.7,
                    buffer_size=50_000,
                    pin_weight=False):
        """
        Called after initialized. 
        Reorder the weight rows according to the ids_freq_mapping.
        Then, let the weights of the Module be managed by a CachedParamMgr.
        
        Args:
            cuda_row_num (int): number of rows can be hosted in CUDA memory
            ids_freq_mapping (List[int]): a list, idx is id number, value is freq
            warmup_ratio (float): the amount of rows preloaded in cuda cache
        """
        self.cache_weight_mgr = CachedParamMgr(weight, cuda_row_num, buffer_size, pin_weight)
        self.cache_weight_mgr.reorder(ids_freq_mapping, warmup_ratio)

    def forward(self, indices, offsets=None, per_sample_weights=None, shape_hook=None):
        with torch.no_grad():
            reorder_ids = self.cache_weight_mgr.prepare_ids(indices)

        embeddings = F.embedding_bag(reorder_ids, self.cache_weight_mgr.cuda_cached_weight, offsets, self.max_norm,
                                     self.norm_type, self.scale_grad_by_freq, self.mode, self.sparse,
                                     per_sample_weights, self.include_last_offset, self.padding_idx)
        if shape_hook is not None:
            embeddings = shape_hook(embeddings)
        return embeddings

    @property
    def weight(self):
        return self.cache_weight_mgr.weight

    def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Parameter]]:
        yield 'weight', self.cache_weight_mgr.cuda_cached_weight

    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
        yield self.cache_weight_mgr.cuda_cached_weight


############################# Perf Log ###################################

    @property
    def num_hits_history(self):
        return self.cache_weight_mgr.num_hits_history

    @property
    def num_miss_history(self):
        return self.cache_weight_mgr.num_miss_history

    @property
    def num_write_back_history(self):
        return self.cache_weight_mgr.num_write_back_history

    @property
    def swap_in_bandwidth(self):
        if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
            return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
                   self.cache_weight_mgr._cpu_to_cuda_elpase
        else:
            return 0

    @property
    def swap_out_bandwidth(self):
        if self.cache_weight_mgr._cuda_to_cpu_numel > 0:
            return self.cache_weight_mgr._cuda_to_cpu_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
                   self.cache_weight_mgr._cuda_to_cpu_elapse
        return 0

    @property
    def input_id_percent_in_load_chunk(self):
        return 0    # np.mean(self.cache_weight_mgr.input_id_percent_in_load_chunk) * 100
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`import torch`
			`import torch.nn.functional as F`
			`from typing import List, Optional, Iterator, Tuple`

			`from .base_embedding import BaseEmbeddingBag`
			`from .cache_mgr import CachedParamMgr`
			`from torch.nn.parameter import Parameter`


			`class FreqAwareEmbeddingBag(BaseEmbeddingBag):`

[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00			`def __init__(`
			`self,`
			`num_embeddings,`
			`embedding_dim,`
			`padding_idx=None,`
			`max_norm=None,`
			`norm_type=2.,`
			`scale_grad_by_freq=False,`
			`sparse=False,`
			`_weight=None,`
			`mode='mean',`
			`include_last_offset=False,`
			`dtype=None,`
			`device=None,`
			`cuda_row_num=0,`
			`ids_freq_mapping=None,`
			`warmup_ratio=0.7,`
			`buffer_size=50_000,`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`pin_weight=False,`
[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00			`):`
			`super(FreqAwareEmbeddingBag, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type,`
			`scale_grad_by_freq, sparse, mode, include_last_offset)`

			`if _weight is None:`
			`_weight = self._weight_alloc(dtype, device)`

			`# configure weight & cache`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`self._preprocess(_weight, cuda_row_num, ids_freq_mapping, warmup_ratio, buffer_size, pin_weight)`
[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00
			`def _weight_alloc(self, dtype, device):`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`weight = torch.empty(self.num_embeddings, self.embedding_dim, dtype=dtype, device=device)`
[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00			`with torch.no_grad():`
			`weight.data.uniform_(-1 / self.num_embeddings, 1 / self.num_embeddings)`
			`if self.padding_idx is not None:`
			`weight[self.padding_idx].fill_(0)`
			`return weight`

			`def _preprocess(self,`
			`weight,`
			`cuda_row_num: int,`
			`ids_freq_mapping: Optional[List[int]] = None,`
			`warmup_ratio=0.7,`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`buffer_size=50_000,`
			`pin_weight=False):`
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`"""`
			`Called after initialized.`
			`Reorder the weight rows according to the ids_freq_mapping.`
			`Then, let the weights of the Module be managed by a CachedParamMgr.`
[NFC] polish doc style for ColoTensor (#1457) 2022-08-16 01:21:05 +00:00
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`Args:`
			`cuda_row_num (int): number of rows can be hosted in CUDA memory`
			`ids_freq_mapping (List[int]): a list, idx is id number, value is freq`
			`warmup_ratio (float): the amount of rows preloaded in cuda cache`
			`"""`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`self.cache_weight_mgr = CachedParamMgr(weight, cuda_row_num, buffer_size, pin_weight)`
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`self.cache_weight_mgr.reorder(ids_freq_mapping, warmup_ratio)`

[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`def forward(self, indices, offsets=None, per_sample_weights=None, shape_hook=None):`
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`with torch.no_grad():`
			`reorder_ids = self.cache_weight_mgr.prepare_ids(indices)`

			`embeddings = F.embedding_bag(reorder_ids, self.cache_weight_mgr.cuda_cached_weight, offsets, self.max_norm,`
			`self.norm_type, self.scale_grad_by_freq, self.mode, self.sparse,`
			`per_sample_weights, self.include_last_offset, self.padding_idx)`
[FCE] update interface for frequency statistics in FreqCacheEmbedding (#1462) 2022-08-23 09:38:24 +00:00			`if shape_hook is not None:`
			`embeddings = shape_hook(embeddings)`
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`return embeddings`

			`@property`
			`def weight(self):`
[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00			`return self.cache_weight_mgr.weight`
Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00
			`def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Parameter]]:`
			`yield 'weight', self.cache_weight_mgr.cuda_cached_weight`

			`def parameters(self, recurse: bool = True) -> Iterator[Parameter]:`
			`yield self.cache_weight_mgr.cuda_cached_weight`

[FAW] reorganize the inheritance struct of FreqCacheEmbedding (#1448) 2022-08-12 07:55:46 +00:00
			`############################# Perf Log ###################################`

Add FreqAwareEmbeddingBag (#1421) 2022-08-09 08:26:12 +00:00			`@property`
			`def num_hits_history(self):`
			`return self.cache_weight_mgr.num_hits_history`

			`@property`
			`def num_miss_history(self):`
			`return self.cache_weight_mgr.num_miss_history`

			`@property`
			`def num_write_back_history(self):`
			`return self.cache_weight_mgr.num_write_back_history`

			`@property`
			`def swap_in_bandwidth(self):`
			`if self.cache_weight_mgr._cpu_to_cuda_numel > 0:`
			`return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \`
			`self.cache_weight_mgr._cpu_to_cuda_elpase`
			`else:`
			`return 0`

			`@property`
			`def swap_out_bandwidth(self):`
			`if self.cache_weight_mgr._cuda_to_cpu_numel > 0:`
			`return self.cache_weight_mgr._cuda_to_cpu_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \`
			`self.cache_weight_mgr._cuda_to_cpu_elapse`
			`return 0`

			`@property`
			`def input_id_percent_in_load_chunk(self):`
			`return 0 # np.mean(self.cache_weight_mgr.input_id_percent_in_load_chunk) * 100`