mirror of https://github.com/hpcaitech/ColossalAI
[nfc] fix typo colossalai/nn (#3887)
* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
* fix typo colossalai/auto_parallel autochunk fx/passes etc.
* fix typo docs/
* change placememt_policy to placement_policy in docs/ and examples/
* fix typo colossalai/ applications/
* fix typo colossalai/cli fx kernel
* fix typo colossalai/nn
* revert change warmuped

pull/3898/head^2
parent ae02d4e4f7
commit 1878749753
@@ -195,7 +195,7 @@ class _Linear(nn.Module):
         keep_master_weight_for_test: This was added for testing and should be
                                      set to False. It returns the master weights
                                      used for initialization.
-        skip_bias_add: This was added to enable performance optimations where bias
+        skip_bias_add: This was added to enable performance optimizations where bias
                        can be fused with other elementwise operations. we skip
                        adding bias but instead return it.
     """
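The docstring above describes the skip_bias_add path: the layer returns the bias instead of adding it, so the caller can fuse the bias add with another elementwise op. A minimal PyTorch sketch of that pattern; the class and argument names here are illustrative, not Colossal-AI's actual `_Linear` API:

import torch
import torch.nn.functional as F


class SkipBiasLinear(torch.nn.Module):
    """Linear layer that can defer the bias add to the caller."""

    def __init__(self, in_features: int, out_features: int, skip_bias_add: bool = False):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features))
        self.bias = torch.nn.Parameter(torch.zeros(out_features))
        self.skip_bias_add = skip_bias_add
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, x):
        if self.skip_bias_add:
            # Return the bias separately so it can be fused downstream.
            return F.linear(x, self.weight), self.bias
        return F.linear(x, self.weight, self.bias), None


layer = SkipBiasLinear(16, 32, skip_bias_add=True)
out, bias = layer(torch.randn(4, 16))
# Fuse the bias add with the activation in a single elementwise pass.
y = F.gelu(out + bias)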
@@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
         # Subtract the maximum value.
         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
 
-        # Get the partition's vocab indecies
+        # Get the partition's vocab indices
         partition_vocab_size = vocab_parallel_logits.size()[-1]
         rank = dist.get_rank(process_group)
         vocab_start_index = partition_vocab_size * rank
@@ -61,10 +61,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
     @custom_bwd
     def backward(ctx, grad_output):
 
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target_1d = ctx.saved_tensors
 
-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         grad_input = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
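The backward above relies on the identity that the gradient of cross entropy with respect to the logits is the softmax with 1 subtracted at the target index (the vocab-parallel version additionally masks targets that fall outside the local partition). A small single-device sanity check in plain PyTorch:

import torch

logits = torch.randn(3, 5, requires_grad=True)
target = torch.tensor([1, 4, 2])

loss = torch.nn.functional.cross_entropy(logits, target, reduction="sum")
loss.backward()

# Manual gradient: softmax minus a one-hot at the target positions.
manual = torch.softmax(logits.detach(), dim=-1)
manual[torch.arange(3), target] -= 1.0

print(torch.allclose(logits.grad, manual, atol=1e-6))  # True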
@@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors
 
         # All the inputs have softmax as their gradient.
@@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target = ctx.saved_tensors
 
         # All the inputs have softmax as their gradient.
@@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors
 
-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         input_grad = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
@@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):
 
     `CPUAdam` requires CUDA extensions which can be built during installation or runtime.
 
-    This version of CPU Adam accelates parameters updating on CPU with SIMD.
+    This version of CPU Adam accelerates parameters updating on CPU with SIMD.
     Support of AVX2 or AVX512 is required.
 
     The GPU part is implemented in an naive way.
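A minimal usage sketch of `CPUAdam` as a drop-in torch optimizer; the import path `colossalai.nn.optimizer.CPUAdam` and the default constructor arguments are assumptions about the installed version, and the SIMD extension mentioned in the docstring must be built for it to run:

import torch
from colossalai.nn.optimizer import CPUAdam  # assumed import path

model = torch.nn.Linear(128, 128)             # parameters kept on CPU
optimizer = CPUAdam(model.parameters(), lr=1e-3)

for _ in range(10):
    x = torch.randn(32, 128)
    loss = model(x).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                          # SIMD-accelerated Adam update on CPU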
@@ -59,7 +59,7 @@ class Lamb(Optimizer):
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
+                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
 
                 state = self.state[p]
 
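The guard above rejects sparse gradients. In practice these usually come from `torch.nn.Embedding(..., sparse=True)`; a quick way to see which parameters would trip this check, using plain PyTorch only:

import torch

emb = torch.nn.Embedding(1000, 16, sparse=True)
loss = emb(torch.tensor([1, 2, 3])).sum()
loss.backward()

# True -> Lamb raises the error above; torch.optim.SparseAdam handles this case.
print(emb.weight.grad.is_sparse)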
@@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
             self.offloader = None
         self.is_on_nvme: Dict[Parameter, bool] = {}
         self.offloaded_numel: int = 0
-        # As param may be not materialized here, these attributes are initalized when the first step
+        # As param may be not materialized here, these attributes are initialized when the first step
         self.total_numel: Optional[int] = None
         self.can_offload_numel: Optional[int] = None
 
@@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
 
     Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
     It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
-    You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
+    You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
 
     Args:
         num_embeddings (int): size of the dictionary of embeddings
         embedding_dim (int): the size of each embedding vector
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
-        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
+        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
         scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
         sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
-        _weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
+        _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
         mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
-        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
+        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
         pin_weight (bool, optional): pin the cpu weight. Defaults to False.
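A construction sketch built from the arguments documented above. The import locations are assumptions (they have moved between Colossal-AI versions) and the value choices are illustrative only:

import torch
# Assumed import location; adjust for your Colossal-AI version.
from colossalai.nn.parallel.layers import CachedEmbeddingBag, EvictionStrategy

ids_freq = torch.randint(1, 100, (100_000,))     # per-row access frequency statistics

bag = CachedEmbeddingBag(
    num_embeddings=100_000,
    embedding_dim=64,
    mode="sum",
    ids_freq_mapping=ids_freq,   # lets frequently used rows be cached on CUDA first
    warmup_ratio=0.7,            # fraction of the CUDA cache warmed up ahead of time
    cache_ratio=0.01,            # #cuda_weight_row / #cpu_weight_row
    evict_strategy=EvictionStrategy.LFU,
)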
@@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
     def swap_in_bandwidth(self):
         if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
             return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
-                self.cache_weight_mgr._cpu_to_cuda_elpase
+                self.cache_weight_mgr._cpu_to_cuda_elapse
         else:
             return 0
 
@@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object):
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
-        The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
+        The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
 
         Args:
             dim (int): dimension along which to index
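The copy path described above, `src[src_index] -(index_select)-> tmp -(index_copy_)-> tgt[tgt_index]`, can be reproduced with plain PyTorch ops. A buffer-limited sketch, where `buff_rows` stands in for the class's buffer limit and the function name is illustrative:

import torch

def limited_index_copy(dim, src_index, tgt_index, src, tgt, buff_rows=1024):
    """Copy src rows selected by src_index into tgt rows at tgt_index, in chunks."""
    assert src_index.numel() == tgt_index.numel()
    for start in range(0, src_index.numel(), buff_rows):
        s_idx = src_index[start:start + buff_rows]
        t_idx = tgt_index[start:start + buff_rows]
        tmp = src.index_select(dim, s_idx)    # gather the selected rows into a buffer
        tgt.index_copy_(dim, t_idx, tmp)      # scatter the buffer into the target rows

src = torch.arange(20.0).reshape(10, 2)
tgt = torch.zeros(10, 2)
limited_index_copy(0, torch.tensor([0, 3, 7]), torch.tensor([9, 1, 4]), src, tgt)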
@@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
 
         # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
         local_output = torch.cat(local_output_list, 1)
-        # then concatenate those local_output on the second demension.
+        # then concatenate those local_output on the second dimension.
         # use all_to_all
         remains = batch_size % self.world_size
         scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
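The last two lines of the hunk compute how an uneven batch is split across ranks before the all_to_all: every rank gets `batch_size // world_size` rows, and the first `batch_size % world_size` ranks get one extra. A quick check of the formula with illustrative values:

batch_size, world_size = 10, 4
remains = batch_size % world_size
scatter_strides = [batch_size // world_size + int(i < remains) for i in range(world_size)]
print(scatter_strides)                 # [3, 3, 2, 2]
assert sum(scatter_strides) == batch_size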