mirror of https://github.com/hpcaitech/ColossalAI
[NFC] fix typo colossalai/auto_parallel nn utils etc. (#3779)
* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
parent e871e342b3
commit 9265f2d4d7

@@ -6,7 +6,7 @@ from tqdm import tqdm
 from .utils import is_rank_0


-# Dahaos/rm-static
+# Dahoas/rm-static
 class RmStaticDataset(Dataset):
     """
     Dataset for reward model

@@ -155,7 +155,7 @@ class EmbeddingModuleHandler(ModuleHandler):
         Convert the sharding spec from the logical shape to the physical shape.
         """
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
                                                                                        input_name=str(

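The comment fixed above describes how a shard on the handler's logical 1D input gets mapped onto the real, multi-dimensional input. A hypothetical illustration of that remapping (the dict layout mirrors a `dim_partition_dict`, but the shapes and enumeration below are assumptions, not library code):

```python
# logical view of the embedding input: a flattened 1D tensor of token ids,
# sharded along dim 0 by mesh dimension 0
logical_dim_partition = {0: [0]}

# physical input is multi-dimensional, e.g. (batch, seq_len) -> rank 2
physical_input_rank = 2

# the logical dim-0 shard may land on any of the leading physical dims,
# so one candidate sharding spec is enumerated per physical dimension
candidate_specs = [{dim: logical_dim_partition[0]} for dim in range(physical_input_rank)]

print(candidate_specs)   # [{0: [0]}, {1: [0]}]
```
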
@@ -221,7 +221,7 @@ class EmbeddingFunctionHandler(NodeHandler):
         Convert the sharding spec from the logical shape to the physical shape.
         """
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at logical dim 0 to one of the first few dimensions of the input and output
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_embedding(strategy=strategy,
                                                                                        input_name=str(

@@ -23,7 +23,7 @@ def _update_sharding_spec_for_transposed_weight_for_linear(strategy: ShardingStr
                                                            weight_name: str) -> ShardingStrategy:
     """
     This function is a helper function used by both module node handler and function node handler. This function will
-    convert the sharding spec for the transposed weight to the correct partititon spec.
+    convert the sharding spec for the transposed weight to the correct partition spec.

     Args:
         strategy (ShardingStrategy): the strategy generated by the strategy generator.

@@ -197,7 +197,7 @@ class LinearModuleHandler(MetaInfoModuleHandler):
         strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy, weight_name='weight')

         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at dim 0 to one of the first few dimensions of the input
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
                                                                                     input_name=str(self.node.args[0]),

@@ -267,7 +267,7 @@ class LinearFunctionHandler(MetaInfoNodeHandler):
         strategy = _update_sharding_spec_for_transposed_weight_for_linear(strategy=strategy,
                                                                           weight_name=str(self.node.args[1]))
         # create multiple sharding strategies for the inputs
-        # as input can be multi-dimensinal and the partition dim is only 2D,
+        # as input can be multi-dimensional and the partition dim is only 2D,
         # we need to map the partition at dim 0 to one of the first few dimensions of the input
         strategies = _convert_logical_sharding_to_physical_sharding_spec_for_linear(strategy=strategy,
                                                                                     input_name=str(self.node.args[0]),

@@ -48,8 +48,8 @@ def get_matmul_type(input_dim: int, other_dim: int):
     Determine which type of matmul operation should be executed for the given tensor dimensions.

     Args:
-        input_dim (int): the number of dimensions for the input tenosr
-        other_dim (int): the number of dimensions for the other tenosr
+        input_dim (int): the number of dimensions for the input tensor
+        other_dim (int): the number of dimensions for the other tensor
     """
     if input_dim == 1 and other_dim == 1:
         matmul_type = MatMulType.DOT

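The dispatch documented by this docstring maps operand ranks to a matmul flavour; the hunk only shows the 1D x 1D (dot product) branch. A simplified sketch, assuming the enum covers the usual dot / matrix-vector / matrix-matrix / batched cases (enum member names other than `DOT` are assumptions, not verified against the library):

```python
from enum import Enum


class MatMulType(Enum):
    DOT = 0   # 1D x 1D -> scalar
    MM = 1    # 2D x 2D -> matrix-matrix
    MV = 2    # 2D x 1D -> matrix-vector
    BMM = 3   # at least one operand has 3+ dims -> batched matmul


def get_matmul_type(input_dim: int, other_dim: int) -> MatMulType:
    """Pick the matmul flavour from the ranks of the two operands (simplified sketch)."""
    if input_dim == 1 and other_dim == 1:
        return MatMulType.DOT
    if input_dim == 2 and other_dim == 1:
        return MatMulType.MV
    if input_dim == 2 and other_dim == 2:
        return MatMulType.MM
    return MatMulType.BMM
```
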
@@ -268,13 +268,13 @@ class Viewer(BmmTransform):
         dim_partition_dict = sharding_spec.dim_partition_dict
         entire_shape = sharding_spec.entire_shape

-        # upddate the dimension index for the matrix dimensions
+        # update the dimension index for the matrix dimensions
         if 2 in dim_partition_dict:
             dim_partition_dict[len(self.batch_dims_before_view) + 1] = dim_partition_dict.pop(2)
         if 1 in dim_partition_dict:
             dim_partition_dict[len(self.batch_dims_before_view)] = dim_partition_dict.pop(1)

-        # map the logical batch dim to phyiscal batch dim
+        # map the logical batch dim to physical batch dim
         if 0 in dim_partition_dict:
             batch_dim_shard = dim_partition_dict.pop(0)
             dim_partition_dict[physical_batch_dim] = batch_dim_shard

@@ -414,7 +414,7 @@ class MatMulHandler(MetaInfoNodeHandler):

     def _get_logical_shape_for_mm(self):
         """
-        We need to handle the input tensor for a matrix-matrix multiplcation as the input
+        We need to handle the input tensor for a matrix-matrix multiplication as the input
         tensor can be a 1D or 2D tensor. If it is a 1D tensor, 1 will be prepended to its shape
         (e.g. [4] -> [1, 4]).
         """

@@ -212,7 +212,7 @@ class NodeHandler(ABC):
         return self.strategies_vector

     def post_process(self, strategy: ShardingStrategy) -> Union[ShardingStrategy, List[ShardingStrategy]]:
-        # tranform the strategy generated
+        # transform the strategy generated
         # e.g. to process the sharding strategy for the transposed weights
         return strategy

@@ -30,7 +30,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
     """

     if isinstance(input_, Node):
-        assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data'
+        assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
         meta_tensor = input_._meta_data
         assert meta_tensor is not None, "The given node's _meta_data attribute is None"
         shape = meta_tensor.shape

@@ -6,12 +6,12 @@ import torch

 class PreviousStatus(Enum):
     """
-    This class shows the status of previous comparision.
+    This class shows the status of previous comparison.
     """
     RESET = 0
-    # ORIGIN means the dimension size of original tensor is larger in the previous comparision.
+    # ORIGIN means the dimension size of original tensor is larger in the previous comparison.
     ORIGIN = 1
-    # TGT means the dimension size of target tensor is larger in the previous comparision.
+    # TGT means the dimension size of target tensor is larger in the previous comparison.
     TGT = 2


@@ -91,7 +91,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
             tgt_index += 1

             if previous_label == PreviousStatus.TGT:
-                # if the target dimension size is larger in the previous comparision, which means
+                # if the target dimension size is larger in the previous comparison, which means
                 # the origin dimension size has already accumulated larger than target dimension size, so
                 # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                 reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)

@@ -111,7 +111,7 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
             origin_index += 1

             if previous_label == PreviousStatus.ORIGIN:
-                # if the origin element is larger in the previous comparision, which means
+                # if the origin element is larger in the previous comparison, which means
                 # the target element has already accumulated larger than origin element, so
                 # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                 reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)

@@ -139,7 +139,7 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
     Rule:
         For a sharded dimension of input tensor, if it is not the minimum element of the input tuple,
         the function will return false.
-        To illustrate this issue, there are two cases to analyse:
+        To illustrate this issue, there are two cases to analyze:
         1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal
         operation without distributed tensor.
         2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape

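The rule quoted above can be read as: within any group of origin dimensions merged by the reshape, only the smallest dimension index may stay sharded. A simplified sketch of that check, assuming the tuple-to-tuple layout of `detect_reshape_mapping`'s output (this is an interpretation of the docstring, not the library's implementation):

```python
from typing import Dict, List, Tuple


def can_keep_sharding(input_dim_partition: Dict[int, List[int]],
                      reshape_mapping: Dict[Tuple[int, ...], Tuple[int, ...]]) -> bool:
    """Return False if any sharded input dim is not the minimum of its origin-dim tuple."""
    for origin_dims, _tgt_dims in reshape_mapping.items():
        for sharded_dim in input_dim_partition:
            if sharded_dim in origin_dims and sharded_dim != min(origin_dims):
                return False
    return True


# dims (1, 2) are merged by the reshape; a shard on dim 2 breaks the rule
print(can_keep_sharding({2: [0]}, {(0,): (0,), (1, 2): (1,)}))   # False
print(can_keep_sharding({1: [0]}, {(0,): (0,), (1, 2): (1,)}))   # True
```
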
@@ -13,7 +13,7 @@ from .nvme_optimizer import NVMeOptimizer
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.

-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.

@@ -13,19 +13,19 @@ from .nvme_optimizer import NVMeOptimizer
 class HybridAdam(NVMeOptimizer):
     """Implements Adam algorithm.

-    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
     But the parameters and gradients should on the same device:
       * Parameters on CPU and gradients on CPU is allowed.
       * Parameters on GPU and gradients on GPU is allowed.
       * Parameters on GPU and gradients on CPU is **not** allowed.

-    `HybriadAdam` requires CUDA extensions which can be built during installation or runtime.
+    `HybridAdam` requires CUDA extensions which can be built during installation or runtime.

     This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.

     * For parameters updating on CPU, it uses CPUAdam.
     * For parameters updating on GPU, it uses FusedAdam.
-    * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
+    * Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.

     :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
     or ``torch.optim.Adam`` with ``adamw_mode=False``

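Given the drop-in-replacement claim in the docstring above, usage follows the stock PyTorch optimizer pattern; a minimal sketch (the hyperparameter values and the toy model are illustrative assumptions):

```python
import torch
from colossalai.nn.optimizer import HybridAdam

model = torch.nn.Linear(128, 128).cuda()
# used like torch.optim.AdamW; lr / weight_decay values here are just examples
optimizer = HybridAdam(model.parameters(), lr=1e-3, weight_decay=0.01)

loss = model(torch.randn(4, 128, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```
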
@@ -131,7 +131,7 @@ class HybridAdam(NVMeOptimizer):
                     assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"
                     assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda"

-                    # record the state by gruop and update at once
+                    # record the state by group and update at once
                     g_l.append(p.grad.data)
                     p_l.append(p.data)
                     m_l.append(state['exp_avg'])

@@ -20,8 +20,8 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
         return
     torch.cuda.current_stream().wait_stream(stream)
     # As mentioned in https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html,
-    # PyTorch uses the "caching allocator" for memroy allocation for tensors. When a tensor is
-    # freed, its memory is likely to be reused by newly constructed tenosrs. By default,
+    # PyTorch uses the "caching allocator" for memory allocation for tensors. When a tensor is
+    # freed, its memory is likely to be reused by newly constructed tensors. By default,
     # this allocator traces whether a tensor is still in use by only the CUDA stream where it
     # was created. When a tensor is used by additional CUDA streams, we need to call record_stream
     # to tell the allocator about all these streams. Otherwise, the allocator might free the

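The comment corrected above refers to the standard PyTorch pattern for tensors that cross CUDA streams; a minimal sketch of that pattern (the setup below is assumed, not taken from the library):

```python
import torch

side_stream = torch.cuda.Stream()

with torch.cuda.stream(side_stream):
    # tensor is allocated while the side stream is current (e.g. an async producer)
    t = torch.empty(1024, device='cuda').normal_()

# make the default stream wait for the producer, then consume the tensor
torch.cuda.current_stream().wait_stream(side_stream)
out = t * 2

# tell the caching allocator that `t` is also used by the current stream,
# so its memory is not handed out again before this stream is done with it
t.record_stream(torch.cuda.current_stream())
```
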
@@ -294,7 +294,7 @@ class CachedParamMgr(torch.nn.Module):
             print(
                 f"CPU->CUDA BWD {self._cpu_to_cuda_numel * self.elem_size_in_byte / 1e6 / elapsed} MB/s {self._cpu_to_cuda_numel / 1e6} M elem"
             )
-            print(f'cpu_to_cuda_elpase {elapsed} sec')
+            print(f'cpu_to_cuda_elapse {elapsed} sec')

             for k, v in self._elapsed_dict.items():
                 print(f'{k}: {v}')

@@ -324,7 +324,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     norm_type = float(norm_type)

     # Parameters can be on CPU or CUDA
-    # If parameters are on CPU, disable CUDA kernerls
+    # If parameters are on CPU, disable CUDA kernels

     # Calculate norm.
     if norm_type == inf:

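For the `norm_type == inf` branch visible at the end of this hunk, the norm is the maximum absolute gradient entry across all parameters; a single-device sketch of just that step, not the library's full implementation:

```python
import torch


def inf_grad_norm(parameters) -> float:
    """Maximum absolute gradient entry over all parameters (single-device sketch)."""
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    if not grads:
        return 0.0
    return max(g.abs().max().item() for g in grads)
```
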
@@ -46,7 +46,7 @@ detector.detect()

 I have made some comments on the right of the output for your understanding.

-Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memery Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.
+Note that the total `Mem` of all the tensors and parameters is not equal to `Total GPU Memory Allocated`. PyTorch's memory management is really complicated, and for models of a large scale, it's impossible to figure out clearly.

 **The order of print is not equal to the order the tensor creates, but they are really close.**

@@ -61,7 +61,7 @@ Note that the total `Mem` of all the tensors and parameters is not equal to `Tot
 + mlp.2.bias cuda:0 (32,) True torch.float32 128 B
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 27
-Totle GPU Memery Allocated on cuda:0 is 4.5 KB
+Total GPU Memory Allocated on cuda:0 is 4.5 KB
 ------------------------------------------------------------------------------------------------------------


@@ -72,7 +72,7 @@ Totle GPU Memery Allocated on cuda:0 is 4.5 KB
 + Tensor cuda:0 (32,) True torch.float32 128 B # output
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 30
-Totle GPU Memery Allocated on cuda:0 is 5.5 KB
+Total GPU Memory Allocated on cuda:0 is 5.5 KB
 ------------------------------------------------------------------------------------------------------------


@@ -82,7 +82,7 @@ Totle GPU Memery Allocated on cuda:0 is 5.5 KB
 + Tensor cuda:0 () True torch.float32 4 B # loss
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 32
-Totle GPU Memery Allocated on cuda:0 is 6.0 KB
+Total GPU Memory Allocated on cuda:0 is 6.0 KB
 ------------------------------------------------------------------------------------------------------------


@@ -103,7 +103,7 @@ Totle GPU Memery Allocated on cuda:0 is 6.0 KB
 - Tensor cuda:0 (8,) True torch.float32 32 B # deleted activation
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 34
-Totle GPU Memery Allocated on cuda:0 is 10.0 KB
+Total GPU Memory Allocated on cuda:0 is 10.0 KB
 ------------------------------------------------------------------------------------------------------------


@@ -117,7 +117,7 @@ Totle GPU Memery Allocated on cuda:0 is 10.0 KB
 + Tensor cuda:0 (32,) False torch.float32 128 B
 ------------------------------------------------------------------------------------------------------------
 Detect Location: "test_tensor_detector.py" line 36
-Totle GPU Memery Allocated on cuda:0 is 14.0 KB
+Total GPU Memory Allocated on cuda:0 is 14.0 KB
 ------------------------------------------------------------------------------------------------------------
 ```


@@ -55,7 +55,7 @@ class TensorDetector():
         return self.mem_format(memory_size)

     def mem_format(self, real_memory_size):
-        # format the tensor memory into a reasonal magnitude
+        # format the tensor memory into a reasonable magnitude
         if real_memory_size >= 2**30:
             return str(real_memory_size / (2**30)) + ' GB'
         if real_memory_size >= 2**20:

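The helper touched above only rescales a byte count into a readable unit; an equivalent standalone sketch (the MB/KB/B branches are assumed to mirror the GB branch shown in the hunk):

```python
def mem_format(real_memory_size: int) -> str:
    """Format a byte count into a human-readable magnitude (sketch of the helper above)."""
    if real_memory_size >= 2**30:
        return str(real_memory_size / (2**30)) + ' GB'
    if real_memory_size >= 2**20:
        return str(real_memory_size / (2**20)) + ' MB'
    if real_memory_size >= 2**10:
        return str(real_memory_size / (2**10)) + ' KB'
    return str(real_memory_size) + ' B'
```
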
@@ -71,7 +71,7 @@ class TensorDetector():
             if (not self.include_cpu) and obj.device == torch.device('cpu'):
                 continue
             self.detected.append(id(obj))
-            # skip paramters we had added in __init__ when module is an instance of nn.Module for the first epoch
+            # skip parameters we had added in __init__ when module is an instance of nn.Module for the first epoch
             if id(obj) not in self.tensor_info:

                 name = type(obj).__name__

@@ -84,7 +84,7 @@ class TensorDetector():
                     name = par_name + ' (with grad)'
                 else:
                     # with no grad attached
-                    # there will be no new paramters created during running
+                    # there will be no new parameters created during running
                     # so it must be in saved_tensor_info
                     continue
             # we can also marked common tensors as tensor(with grad)

@@ -155,7 +155,7 @@ class TensorDetector():
             if device == torch.device('cpu'):
                 continue
             gpu_mem_alloc = self.mem_format(torch.cuda.memory_allocated(device))
-            self.info += f"Totle GPU Memery Allocated on {device} is {gpu_mem_alloc}\n"
+            self.info += f"Total GPU Memory Allocated on {device} is {gpu_mem_alloc}\n"
             self.info += LINE
             self.info += '\n\n'
         if self.show_info:

@@ -102,7 +102,7 @@ class ChunkManager:
         """
         if chunk in self.accessed_chunks:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         if chunk.device_type == 'cpu':
             chunk.shard_move(get_current_device())
         self.__add_accessed_chunk(chunk)

@@ -114,7 +114,7 @@ class ChunkManager:
         if chunk not in self.accessed_chunks:
             return
         if chunk.can_release:
-            self.__sub_memroy_usage(chunk.memory_usage)
+            self.__sub_memory_usage(chunk.memory_usage)
             self.__sub_accessed_chunk(chunk)
             self.__add_memory_usage(chunk.memory_usage)

@@ -123,7 +123,7 @@ class ChunkManager:
         """
         if not chunk.can_move or chunk.device_type == device.type:
             return
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.shard_move(device, force_copy)
         self.__add_memory_usage(chunk.memory_usage)

@@ -138,7 +138,7 @@ class ChunkManager:
         """
         if not chunk.can_reduce:
             return False
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.reduce()
         self.__sub_accessed_chunk(chunk)
         self.__add_memory_usage(chunk.memory_usage)

@@ -228,11 +228,11 @@ class ChunkManager:
         return self.chunk_groups[group_name]

     def __close_one_chunk(self, chunk: Chunk):
-        self.__sub_memroy_usage(chunk.memory_usage)
+        self.__sub_memory_usage(chunk.memory_usage)
         chunk.close_chunk()
         self.__add_memory_usage(chunk.memory_usage)

-    def __sub_memroy_usage(self, usage: Dict[str, int]):
+    def __sub_memory_usage(self, usage: Dict[str, int]):
         for k, v in usage.items():
             self.total_mem[k] -= v

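Every `__sub_memroy_usage` call renamed in these ChunkManager hunks follows the same bookkeeping pattern: subtract the chunk's current memory usage from the running totals, mutate the chunk, then add the (possibly changed) usage back. A toy sketch of that pattern (simplified types; the chunk object and its `move_to` method are hypothetical stand-ins, not the real API):

```python
from typing import Dict


class UsageTracker:
    """Toy tracker illustrating the subtract/mutate/add bookkeeping pattern."""

    def __init__(self) -> None:
        self.total_mem: Dict[str, int] = {'cpu': 0, 'cuda': 0}

    def _sub_memory_usage(self, usage: Dict[str, int]) -> None:
        for k, v in usage.items():
            self.total_mem[k] -= v

    def _add_memory_usage(self, usage: Dict[str, int]) -> None:
        for k, v in usage.items():
            self.total_mem[k] += v

    def move_chunk(self, chunk, device_type: str) -> None:
        # usage may change across the move, so bracket the mutation with sub/add
        self._sub_memory_usage(chunk.memory_usage)
        chunk.move_to(device_type)          # hypothetical mutation of the chunk
        self._add_memory_usage(chunk.memory_usage)
```
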
@@ -85,7 +85,7 @@ def classify_params_by_dp_degree(param_order: OrderedParamGenerator,
     Classify the parameters by their dp degree

     Args:
-        param_order (OrderedParamGenerator): the order of param be visied
+        param_order (OrderedParamGenerator): the order of param be vised
         strict_ddp_flag (bool, optional): whether to enable the strict ddp mode. Defaults to False.

     Returns:

@@ -59,7 +59,7 @@ class MemStats(object):
         time step.

         Args:
-            param_list (List[torch.nn.Parameter]): a list of torch paramters.
+            param_list (List[torch.nn.Parameter]): a list of torch parameters.
         """
         for p in param_list:
             if p not in self._param_step_dict: