From 4537d39df94e1eca56348aa249338790195065ee Mon Sep 17 00:00:00 2001 From: Jiarui Fang Date: Wed, 31 Aug 2022 13:52:30 +0800 Subject: [PATCH] [doc] docstring for FreqAwareEmbeddingBag (#1525) --- .../layers/cache_embedding/cache_mgr.py | 8 +-- .../cache_embedding/freq_aware_embedding.py | 62 +++++++++++++------ 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py index 08c206ec9..ee7ce0607 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -25,11 +25,9 @@ class CachedParamMgr(torch.nn.Module): weight (torch.Tensor): the weight of the Embedding layer. cuda_row_num (int, optional): the number of rows cached in CUDA memory. Defaults to 0. buffer_size (int, optional): the number of rows in a data transmitter buffer. Defaults to 50_000. - pin_weight (bool, optional): use pin memory to store the cpu weight. If set `True`, the cpu memory usage will increase largely. Defaults to False. - evict_strategy (EvictionStrategy, optional): the eviction strategy. There are two options. - `EvictionStrategy.LFU`: use the least frequently used cache. - `EvictionStrategy.DATASET`: use the stats collected from the target dataset. It usually leads to less cpu-gpu communication volume. - Defaults to EvictionStrategy.DATASET. + pin_weight (bool, optional): use pin memory to store the cpu weight. If set to `True`, the cpu memory usage will increase significantly. Defaults to False. + evict_strategy (EvictionStrategy, optional): the eviction strategy. There are two options. `EvictionStrategy.LFU` uses the least frequently used cache. `EvictionStrategy.DATASET` uses the stats collected from the target dataset. It usually leads to less cpu-gpu communication volume. + Defaults to EvictionStrategy.DATASET. use_cpu_caching (bool, optional): use cpu to execute cache indexing. 
It is slower than use gpu. """ diff --git a/colossalai/nn/parallel/layers/cache_embedding/freq_aware_embedding.py b/colossalai/nn/parallel/layers/cache_embedding/freq_aware_embedding.py index 58352a70d..ca911f9d1 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/freq_aware_embedding.py +++ b/colossalai/nn/parallel/layers/cache_embedding/freq_aware_embedding.py @@ -1,6 +1,6 @@ import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple +from typing import List, Optional, Iterator, Tuple, Union from .base_embedding import BaseEmbeddingBag from .cache_mgr import CachedParamMgr, EvictionStrategy @@ -8,25 +8,51 @@ from torch.nn.parameter import Parameter class FreqAwareEmbeddingBag(BaseEmbeddingBag): + """FreqAwareEmbeddingBag + + Frequency Aware Embedding. Applies a GPU-based software cache approach to dynamically manage the embedding table in the CPU and GPU memory space. + It can leverage the ids' frequency statistics of the target dataset by passing a frequency list to param `ids_freq_mapping`. + You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` to EvictionStrategy.LFU. + + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction. + max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm + norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2. 
+ scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Note: this option is not supported when mode="max". Defaults to False. + sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max". Defaults to False. + _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in an embedding bag as a single one. Defaults to None. + mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Defaults to 'mean'. + include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format. Defaults to False. + dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32. + device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu. + cuda_row_num (int, optional): the max number of embedding vectors in the cuda cache. Defaults to 0. + ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency with which each embedding vector occurs in the dataset. Defaults to None. + warmup_ratio (float, optional): the ratio of the cuda cache to be warmed up. Defaults to 0.7. + buffer_size (int, optional): the max number of vectors in transmitter buffer. Defaults to 50_000. + pin_weight (bool, optional): pin the cpu weight. Defaults to False. + evict_strategy (EvictionStrategy, optional): evict strategy of the software cache. Defaults to EvictionStrategy.DATASET. 
+ """ def __init__(self, - num_embeddings, - embedding_dim, - padding_idx=None, - max_norm=None, - norm_type=2., - scale_grad_by_freq=False, - sparse=False, - _weight=None, - mode='mean', - include_last_offset=False, - dtype=None, - device=None, - cuda_row_num=0, - ids_freq_mapping=None, - warmup_ratio=0.7, - buffer_size=50_000, - pin_weight=False, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + max_norm: float = None, + norm_type: float = 2., + scale_grad_by_freq: bool = False, + sparse: bool = False, + _weight: Optional[torch.Tensor] = None, + mode: str = 'mean', + include_last_offset: bool = False, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + cuda_row_num: int = 0, + ids_freq_mapping: Optional[Union[List, torch.Tensor]] = None, + warmup_ratio: float = 0.7, + buffer_size: int = 50_000, + pin_weight: bool = False, evict_strategy: EvictionStrategy = EvictionStrategy.DATASET): super(FreqAwareEmbeddingBag, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse, mode, include_last_offset)