[nfc] fix typo colossalai/nn (#3887)

* fix typo colossalai/autochunk auto_parallel amp

* fix typo colossalai/auto_parallel nn utils etc.

* fix typo colossalai/auto_parallel autochunk fx/passes  etc.

* fix typo docs/

* change placememt_policy to placement_policy in docs/ and examples/

* fix typo colossalai/ applications/

* fix typo colossalai/cli fx kernel

* fix typo colossalai/nn

* revert change warmuped
digger yu 2023-06-05 16:04:27 +08:00 committed by GitHub
parent ae02d4e4f7
commit 1878749753
11 changed files with 18 additions and 18 deletions

View File

@@ -195,7 +195,7 @@ class _Linear(nn.Module):
 keep_master_weight_for_test: This was added for testing and should be
 set to False. It returns the master weights
 used for initialization.
-skip_bias_add: This was added to enable performance optimations where bias
+skip_bias_add: This was added to enable performance optimizations where bias
 can be fused with other elementwise operations. we skip
 adding bias but instead return it.
 """

View File

@@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
 # Subtract the maximum value.
 vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
-# Get the partition's vocab indecies
+# Get the partition's vocab indices
 partition_vocab_size = vocab_parallel_logits.size()[-1]
 rank = dist.get_rank(process_group)
 vocab_start_index = partition_vocab_size * rank
@@ -61,10 +61,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
 @custom_bwd
 def backward(ctx, grad_output):
-# Retreive tensors from the forward path.
+# Retrieve tensors from the forward path.
 softmax, target_mask, masked_target_1d = ctx.saved_tensors
-# All the inputs have softmax as thier gradient.
+# All the inputs have softmax as their gradient.
 grad_input = softmax
 # For simplicity, work with the 2D gradient.
 partition_vocab_size = softmax.size()[-1]
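
The backward comments above rely on the identity that the gradient of cross entropy with respect to the logits is the softmax minus the one-hot target (before the vocab-partition masking). A small plain-PyTorch check of that fact, not taken from the diff:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(2, 8, requires_grad=True)
target = torch.tensor([3, 5])

# reduction="sum" so the per-sample gradient is exactly softmax - one_hot
loss = F.cross_entropy(logits, target, reduction="sum")
loss.backward()

softmax = torch.softmax(logits.detach(), dim=-1)
one_hot = F.one_hot(target, num_classes=8).float()
assert torch.allclose(logits.grad, softmax - one_hot, atol=1e-6)
```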

View File

@@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
 @staticmethod
 @custom_bwd
 def backward(ctx, output_grad):
-# Retreive tensors from the forward path.
+# Retrieve tensors from the forward path.
 softmax, target_mask, masked_target = ctx.saved_tensors
 # All the inputs have softmax as their gradient.

View File

@@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function):
 @staticmethod
 @custom_bwd
 def backward(ctx, output_grad):
-# Retreive tensors from the forward path.
+# Retrieve tensors from the forward path.
 softmax, target_mask, masked_target = ctx.saved_tensors
 # All the inputs have softmax as their gradient.

View File

@@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
 @staticmethod
 @custom_bwd
 def backward(ctx, output_grad):
-# Retreive tensors from the forward path.
+# Retrieve tensors from the forward path.
 softmax, target_mask, masked_target = ctx.saved_tensors
-# All the inputs have softmax as thier gradient.
+# All the inputs have softmax as their gradient.
 input_grad = softmax
 # For simplicity, work with the 2D gradient.
 partition_vocab_size = softmax.size()[-1]

View File

@@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):
 `CPUAdam` requires CUDA extensions which can be built during installation or runtime.
-This version of CPU Adam accelates parameters updating on CPU with SIMD.
+This version of CPU Adam accelerates parameters updating on CPU with SIMD.
 Support of AVX2 or AVX512 is required.
 The GPU part is implemented in an naive way.
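
A hedged usage sketch of `CPUAdam`: the import path and constructor arguments below are assumptions based on the docstring, not verified against this exact revision, and the CPU extension mentioned above must already be built.

```python
import torch
import torch.nn as nn

# assumed import path for this ColossalAI revision
from colossalai.nn.optimizer import CPUAdam

model = nn.Linear(128, 128)                    # parameters kept on CPU so the SIMD update applies
optimizer = CPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 128)).sum()
loss.backward()
optimizer.step()                               # Adam update runs on CPU via the AVX2/AVX512 kernel
optimizer.zero_grad()
```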

View File

@@ -59,7 +59,7 @@ class Lamb(Optimizer):
 continue
 grad = p.grad.data
 if grad.is_sparse:
-raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
+raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
 state = self.state[p]

View File

@@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
 self.offloader = None
 self.is_on_nvme: Dict[Parameter, bool] = {}
 self.offloaded_numel: int = 0
-# As param may be not materialized here, these attributes are initalized when the first step
+# As param may be not materialized here, these attributes are initialized when the first step
 self.total_numel: Optional[int] = None
 self.can_offload_numel: Optional[int] = None

View File

@@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
 Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
 It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
-You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
+You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
 Args:
 num_embeddings (int): size of the dictionary of embeddings
 embedding_dim (int): the size of each embedding vector
 padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed pad. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
 max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
-norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
+norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
 scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
 sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
-_weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
+_weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
 mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
 include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
 dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
 device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
 cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
-ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
+ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
 warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
 buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
 pin_weight (bool, optional): pin the cpu weight. Defaults to False.
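
A hedged construction sketch based on the argument list above; the import path, the `EvictionStrategy` location, and the `(input, offsets)` forward convention are assumptions rather than verified API for this revision.

```python
import torch

# assumed import path for this ColossalAI revision
from colossalai.nn.parallel.layers import CachedEmbeddingBag, EvictionStrategy

bag = CachedEmbeddingBag(
    num_embeddings=10_000,
    embedding_dim=64,
    mode="mean",
    cache_ratio=0.05,                     # keep roughly 5% of rows resident in the CUDA cache
    ids_freq_mapping=None,                # or a per-id frequency list gathered from the dataset
    evict_strategy=EvictionStrategy.LFU,  # naive LFU eviction, as described in the docstring
)

# forward assumed to follow torch.nn.EmbeddingBag's (input, offsets) convention
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device="cuda")
offsets = torch.tensor([0, 4], device="cuda")
out = bag(indices, offsets)               # expected shape: (2, 64)
```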
@@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
 def swap_in_bandwidth(self):
 if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
 return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
-self.cache_weight_mgr._cpu_to_cuda_elpase
+self.cache_weight_mgr._cpu_to_cuda_elapse
 else:
 return 0
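
Plugging hypothetical numbers into the `swap_in_bandwidth` formula above (elements moved, times element size in bytes, divided by 1e6 and by the elapsed seconds):

```python
# all values below are made up for illustration
cpu_to_cuda_numel = 4_000_000    # elements swapped from CPU to CUDA
elem_size_in_byte = 4            # fp32
cpu_to_cuda_elapse = 0.25        # seconds spent swapping

swap_in_bandwidth = cpu_to_cuda_numel * elem_size_in_byte / 1e6 / cpu_to_cuda_elapse
print(f"{swap_in_bandwidth:.1f} MB/s")   # 64.0 MB/s
```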

View File

@@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object):
 def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
 """copy
 src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
-The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
+The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
 Args:
 dim (int): dimension along which to index
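
The copy pattern described in that docstring, sketched with plain `torch` calls (illustrative only, not the class's buffered implementation):

```python
import torch

src = torch.arange(12.0).reshape(4, 3)   # valid rows are contiguous
tgt = torch.zeros(6, 3)
src_index = torch.tensor([0, 1, 2])      # contiguous rows taken from src
tgt_index = torch.tensor([5, 0, 3])      # scattered rows written in tgt

tmp = src.index_select(0, src_index)     # src[src_index] -> tmp
tgt.index_copy_(0, tgt_index, tmp)       # tmp -> tgt[tgt_index]
```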

View File

@@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
 # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
 local_output = torch.cat(local_output_list, 1)
-# then concatenate those local_output on the second demension.
+# then concatenate those local_output on the second dimension.
 # use all_to_all
 remains = batch_size % self.world_size
 scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
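
The `scatter_strides` expression above splits a batch that does not divide evenly across ranks: ranks with index smaller than `remains` take one extra sample. With hypothetical numbers:

```python
batch_size, world_size = 10, 4
remains = batch_size % world_size                                                  # 2
scatter_strides = [batch_size // world_size + int(i < remains) for i in range(world_size)]
print(scatter_strides)   # [3, 3, 2, 2], which sums back to batch_size
```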