|
|
@ -1,6 +1,6 @@
|
|
|
|
import torch
|
|
|
|
import torch
|
|
|
|
import torch.nn.functional as F
|
|
|
|
import torch.nn.functional as F
|
|
|
|
from typing import List, Optional, Iterator, Tuple
|
|
|
|
from typing import List, Optional, Iterator, Tuple, Union
|
|
|
|
|
|
|
|
|
|
|
|
from .base_embedding import BaseEmbeddingBag
|
|
|
|
from .base_embedding import BaseEmbeddingBag
|
|
|
|
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
|
|
|
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
|
|
@ -8,25 +8,51 @@ from torch.nn.parameter import Parameter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FreqAwareEmbeddingBag(BaseEmbeddingBag):
|
|
|
|
class FreqAwareEmbeddingBag(BaseEmbeddingBag):
|
|
|
|
|
|
|
|
"""FreqAwareEmbeddingBag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Frequency Aware Embedding. Applies a GPU-based software cache approach to dynamically manage the embedding table in the CPU and GPU memory space.
|
|
|
|
|
|
|
|
It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
|
|
|
|
|
|
|
|
You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` to EvictionStrategy.LFU.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
|
|
num_embeddings (int): size of the dictionary of embeddings
|
|
|
|
|
|
|
|
embedding_dim (int): the size of each embedding vector
|
|
|
|
|
|
|
|
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
|
|
|
|
|
|
|
|
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
|
|
|
|
|
|
|
|
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
|
|
|
|
|
|
|
|
scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of the frequency of the words in the mini-batch. Note: this option is not supported when mode="max". Defaults to False.
|
|
|
|
|
|
|
|
sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max". Defaults to False.
|
|
|
|
|
|
|
|
_weight (torch.Tensor, optional): an embedding weight tensor. Concatenates multiple tables in an embedding bag into a single one. Defaults to None.
|
|
|
|
|
|
|
|
mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Defaults to 'mean'.
|
|
|
|
|
|
|
|
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format. Defaults to False.
|
|
|
|
|
|
|
|
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
|
|
|
|
|
|
|
|
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
|
|
|
|
|
|
|
|
cuda_row_num (int, optional): the max number of embedding vectors in the cuda cache. Defaults to 0.
|
|
|
|
|
|
|
|
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency with which each embedding vector occurs in the dataset. Defaults to None.
|
|
|
|
|
|
|
|
warmup_ratio (float, optional): the ratio of the cuda cache that is warmed up. Defaults to 0.7.
|
|
|
|
|
|
|
|
buffer_size (int, optional): the max number of vectors in transmitter buffer. Defaults to 50_000.
|
|
|
|
|
|
|
|
pin_weight (bool, optional): pin the cpu weight. Defaults to False.
|
|
|
|
|
|
|
|
evict_strategy (EvictionStrategy, optional): evict strategy of the software cache. Defaults to EvictionStrategy.DATASET.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self,
|
|
|
|
def __init__(self,
|
|
|
|
num_embeddings,
|
|
|
|
num_embeddings: int,
|
|
|
|
embedding_dim,
|
|
|
|
embedding_dim: int,
|
|
|
|
padding_idx=None,
|
|
|
|
padding_idx: int = None,
|
|
|
|
max_norm=None,
|
|
|
|
max_norm: float = None,
|
|
|
|
norm_type=2.,
|
|
|
|
norm_type: float = 2.,
|
|
|
|
scale_grad_by_freq=False,
|
|
|
|
scale_grad_by_freq: bool = False,
|
|
|
|
sparse=False,
|
|
|
|
sparse: bool = False,
|
|
|
|
_weight=None,
|
|
|
|
_weight: Optional[torch.Tensor] = None,
|
|
|
|
mode='mean',
|
|
|
|
mode: str = 'mean',
|
|
|
|
include_last_offset=False,
|
|
|
|
include_last_offset: bool = False,
|
|
|
|
dtype=None,
|
|
|
|
dtype: Optional[torch.dtype] = None,
|
|
|
|
device=None,
|
|
|
|
device: Optional[torch.device] = None,
|
|
|
|
cuda_row_num=0,
|
|
|
|
cuda_row_num: int = 0,
|
|
|
|
ids_freq_mapping=None,
|
|
|
|
ids_freq_mapping: Optional[Union[List, torch.Tensor]] = None,
|
|
|
|
warmup_ratio=0.7,
|
|
|
|
warmup_ratio: float = 0.7,
|
|
|
|
buffer_size=50_000,
|
|
|
|
buffer_size: int = 50_000,
|
|
|
|
pin_weight=False,
|
|
|
|
pin_weight: bool = False,
|
|
|
|
evict_strategy: EvictionStrategy = EvictionStrategy.DATASET):
|
|
|
|
evict_strategy: EvictionStrategy = EvictionStrategy.DATASET):
|
|
|
|
super(FreqAwareEmbeddingBag, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type,
|
|
|
|
super(FreqAwareEmbeddingBag, self).__init__(num_embeddings, embedding_dim, padding_idx, max_norm, norm_type,
|
|
|
|
scale_grad_by_freq, sparse, mode, include_last_offset)
|
|
|
|
scale_grad_by_freq, sparse, mode, include_last_offset)
|
|
|
|