mirror of https://github.com/hpcaitech/ColossalAI
[nfc]fix typo colossalai/pipeline tensor nn (#3899)
* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
* fix typo colossalai/auto_parallel autochunk fx/passes etc.
* fix typo docs/
* change placememt_policy to placement_policy in docs/ and examples/
* fix typo colossalai/ applications/
* fix typo colossalai/cli fx kernel
* fix typo colossalai/nn
* revert change warmuped
* fix typo colossalai/pipeline tensor nn
parent c1535ccbba
commit 0e484e6201
@@ -13,7 +13,7 @@ from .nvme_optimizer import NVMeOptimizer
 class CPUAdam(NVMeOptimizer):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
     * Parameters on CPU and gradients on CPU is allowed.
     * Parameters on GPU and gradients on GPU is allowed.
@@ -14,7 +14,7 @@ from .cpu_adam import CPUAdam
 class HybridAdam(CPUAdam):
     """Implements Adam algorithm.
 
-    Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
+    Supports parameters updating on both GPU and CPU, depending on the device of parameters.
     But the parameters and gradients should on the same device:
     * Parameters on CPU and gradients on CPU is allowed.
     * Parameters on GPU and gradients on GPU is allowed.
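The docstring fixed above also states the key usage constraint: each parameter and its gradient must live on the same device. A minimal usage sketch, assuming colossalai is installed with its CPU Adam kernel built and a CUDA device is available; the model and hyperparameters are illustrative only, not part of this commit:

import torch
from colossalai.nn.optimizer import CPUAdam, HybridAdam

# Illustrative modules; any torch.nn.Module works.
model_cpu = torch.nn.Linear(1024, 1024)         # parameters (and grads) stay on CPU
model_gpu = torch.nn.Linear(1024, 1024).cuda()  # parameters (and grads) live on GPU

# CPUAdam / HybridAdam follow the usual torch.optim constructor convention.
opt_cpu = CPUAdam(model_cpu.parameters(), lr=1e-3)
opt_gpu = HybridAdam(model_gpu.parameters(), lr=1e-3)

# Gradients are created on the same device as their parameters, satisfying
# the "same device" requirement from the docstring.
loss = model_gpu(torch.randn(8, 1024, device="cuda")).sum()
loss.backward()
opt_gpu.step()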
@@ -83,7 +83,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
         for k, v in kwargs.items():
             if isinstance(v, torch.nn.Module):
                 v = self._layer_spec_dict[id(v)]
-                # (lyl)TODO: analyse ColoTensor as well
+                # (lyl)TODO: analyze ColoTensor as well
             modified_kwargs[k] = v
 
         # keep track of the module children
@@ -117,7 +117,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
     def to_layer_list(self, exec_seq=None):
         """
         Create a layer spec list and func list with execution sequence given by user.
-        If exec_seq is None, we will take the module initizing order as execution order.
+        If exec_seq is None, we will take the module initializing order as execution order.
         """
 
         self._exec_seq = exec_seq
@@ -177,7 +177,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
 
     def partition(self, num_chunks, pipeline_size, rank):
         """
-        Partitioned model will be built respect to partion policy.
+        Partitioned model will be built respect to partition policy.
        The real module instance will be built in this method.
         """
         if isinstance(self._policy, str):
@@ -193,7 +193,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
             self.customized_parts = customized_partition(self._exec_seq)
             assert len(self.customized_parts) == gpc.get_world_size(
                 ParallelMode.PIPELINE
-            ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partions is {len(self.customized_parts)}'
+            ), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}'
             parts = self.customized_parts[rank]
         else:
             raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].")
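Taken together, these hunks sketch the PipelinableContext workflow: trace layers inside the context, build the layer list (module initialization order when exec_seq is None), choose a partition policy, then materialize the partition for the local pipeline rank. A hedged sketch of that flow, assuming a launched ColossalAI runtime with a PIPELINE parallel group; the model, the `policy` attribute name, and the chosen policy value are illustrative assumptions:

import torch
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.pipeline.pipelinable import PipelinableContext

# Assumes colossalai.launch(...) has already set up the PIPELINE group.
pipelinable = PipelinableContext()
with pipelinable:
    # modules created inside the context are recorded as layer specs
    model = torch.nn.Sequential(
        torch.nn.Linear(256, 256),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 10),
    )

pipelinable.to_layer_list()          # exec_seq=None -> module initialization order
pipelinable.policy = "uniform"       # one of 'uniform', 'balanced', 'customized'

pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
rank = gpc.get_local_rank(ParallelMode.PIPELINE)
partitioned_model = pipelinable.partition(num_chunks=1, pipeline_size=pipeline_size, rank=rank)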
@@ -123,7 +123,7 @@ class WorkerBase(ABC):
         self.device = device
         self._initialize_outstanding_range()
 
-        # variable and const for context managment
+        # variable and const for context management
         self.outstanding = 0
         self.forward_times = 0
         self.backward_times = 0
@@ -226,7 +226,7 @@ class WorkerBase(ABC):
         self.pp_rank_to_worker_rref = pp_rank_to_worker_rref
 
         # for some schedule need the other worker's info to initialise partition (like Chimera)
-        # construction of partition is executed after the registion of pp_rank_to_worker_rref
+        # construction of partition is executed after the registration of pp_rank_to_worker_rref
         self._initialize_partition()
 
         # res_use works for lifecycle counter,
@@ -418,7 +418,7 @@ class WorkerBase(ABC):
             # On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer
             # can only be executed once for every producer-consumer stage pair, which is necessary
             # to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same
-            # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+            # lock of work_item queue operation guarantees the consistency of lifecycle counter.
             work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only)
             self.work_list[key] = work_item_from_producer
             self.work_list_condition_lock.notify_all()
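The comment corrected above documents a locking invariant: subscribing to the producer and inserting the resulting work item must happen under the same condition lock, otherwise the item's lifecycle counter could be observed in an inconsistent state. A generic sketch of that pattern with Python's threading primitives; everything except the lock-and-insert shape is illustrative:

import threading

work_list = {}
work_list_condition_lock = threading.Condition()

def enqueue_from_producer(key, microbatch_id, forward_only, subscribe_producer):
    # Doing the subscribe and the queue insertion under one lock means no
    # other thread can see (or double-count) the work item in between.
    with work_list_condition_lock:
        work_item = subscribe_producer(microbatch_id, forward_only)
        work_list[key] = work_item
        work_list_condition_lock.notify_all()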
@@ -460,7 +460,7 @@ class WorkerBase(ABC):
             # On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer
             # can only be executed once for every producer-consumer stage pair, which is necessary
             # to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same
-            # lock of work_item queue operation gurantees the consistency of lifecycle counter.
+            # lock of work_item queue operation guarantees the consistency of lifecycle counter.
             work_item_from_consumer = self._subscribe_consumer(microbatch_id)
             self.work_list[key] = work_item_from_consumer
             self.work_list_condition_lock.notify_all()
@@ -508,7 +508,7 @@ class WorkerBase(ABC):
         assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
         assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
 
-        # should be aranged in order, the order of the input of current forward
+        # should be arranged in order, the order of the input of current forward
         self.producer_stage_ids = self.get_producer_stage_ids()
         self.consumer_stage_ids = self.get_consumer_stage_ids()
 
@@ -123,7 +123,7 @@ class ChimeraWorker(WorkerBase):
         assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
         assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
 
-        # should be aranged in order, the order of the input of current forward
+        # should be arranged in order, the order of the input of current forward
         self.producer_stage_ids = []
         self.consumer_stage_ids = []
 
@@ -174,7 +174,7 @@ class ChimeraWorker(WorkerBase):
         else:
             # if it is down pipeline, create partition by origin method
             co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num]
-            # get the coresponding model state dict and wait for its init
+            # get the corresponding model state dict and wait for its init
             state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict()
             super()._initialize_partition()
             self.module_partition.load_state_dict(state_dict)
@@ -228,7 +228,7 @@ class ChimeraWorker(WorkerBase):
         stage_num = self.actual_stage_num
         co_pp_rank = (pp_rank + stage_num) % (2 * stage_num)
 
-        # if currrent pp_rank is not the first to do step
+        # if current pp_rank is not the first to do step
         # wait its previous pp_rank finish step
         grads = self.get_parameter_gradients()
 
@@ -113,7 +113,7 @@ def _binary_search(weights, num):
 
 def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
     assert num_items % num_chunks == 0, \
-        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
+        "Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
 
     logger = get_dist_logger()
     parts = [[] for _ in range(pipeline_parallel_size)]
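The assertion touched here requires the layer count to divide evenly by the number of chunks. To make the intent concrete, a small illustrative sketch of uniform partitioning (not the library's implementation; it additionally assumes num_items divides evenly across all pipeline_parallel_size * num_chunks pieces, whereas the real partition_uniform handles remainders):

def uniform_parts(num_items: int, pipeline_parallel_size: int, num_chunks: int):
    assert num_items % num_chunks == 0, \
        "Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
    chunk_size = num_items // (pipeline_parallel_size * num_chunks)
    parts = [[] for _ in range(pipeline_parallel_size)]
    # hand out contiguous, equally sized (start, end) ranges chunk by chunk
    for chunk in range(num_chunks):
        for rank in range(pipeline_parallel_size):
            start = (chunk * pipeline_parallel_size + rank) * chunk_size
            parts[rank].append((start, start + chunk_size))
    return parts

# uniform_parts(8, pipeline_parallel_size=2, num_chunks=2)
# -> [[(0, 2), (4, 6)], [(2, 4), (6, 8)]]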
@@ -28,7 +28,7 @@ class CommSpec:
     to determine the buffer shape, and logical_process_axis
 
     Argument:
-        comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
         process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
         gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
         shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
@@ -41,7 +41,7 @@ class DimSpec:
 
     def _convert_str_to_shard_list(self, str_spec):
         '''
-        Conver str_spec into shard_list.
+        Convert str_spec into shard_list.
 
         Argument:
             str_spec(str): dim spec in str type.
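For orientation, the dim-spec strings converted here follow the notation that appears later in this diff ([R, R, S0, S1]): 'R' marks a replicated dimension (empty shard list) and 'Sxy' lists the device-mesh axes that dimension is sharded over. A hedged sketch of the conversion, written standalone rather than as the class's actual method:

def convert_str_to_shard_list(str_spec: str):
    # 'R' -> [], 'S0' -> [0], 'S01' -> [0, 1]
    if str_spec == "R":
        return []
    assert str_spec.startswith("S"), f"invalid dim spec: {str_spec}"
    return [int(axis) for axis in str_spec[1:]]

assert convert_str_to_shard_list("R") == []
assert convert_str_to_shard_list("S01") == [0, 1]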
@@ -58,7 +58,7 @@ class DimSpec:
 
     def build_difference_2d_dict(self):
         '''
-        Build a difference maping for 2D device mesh case. It will be used to
+        Build a difference mapping for 2D device mesh case. It will be used to
         compute the difference between DimSpec pairs.
         '''
 
@@ -164,7 +164,7 @@ def _get_grad_args(*args):
     for obj in args:
         if _is_grad_tensor(obj):
             return args, None
-    # otherwise, the first arguement should be a tuple of grad tensors
+    # otherwise, the first argument should be a tuple of grad tensors
     # if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered
     arg_zero = args[0]
     if not isinstance(arg_zero, tuple):
@@ -130,7 +130,7 @@ class ProcessGroup:
     @property
     def has_cpu_groups(self) -> bool:
         """has_cpu_groups
-        If cpu groups have been initailized.
+        If cpu groups have been initialized.
 
         Returns:
             bool: cpu process groups have been initialized or not.
@@ -252,7 +252,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
     def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict):
         '''
         Get all valid sharding specs from source_spec with single shard operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         For the sharding operation, we just care about legal sharding dimensions.
 
         Argument:
@@ -386,7 +386,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
     def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]:
         '''
         Get all valid sharding specs from source_spec with one step transform, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         Note:
             all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before,
             and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive,
@@ -577,7 +577,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
         Step3:
             Repeat above steps until the source spec transform to target spec.
 
-        During finding the transform path, commucation cost will be accumulated, and it
+        During finding the transform path, communication cost will be accumulated, and it
         will be finally used in auto parallel solver.
 
         Additionally, to avoid repeating the path search in runtime, we cached all solved path
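These docstrings describe searching for a transform path: repeatedly apply one-step transforms (all-gather, all-to-all, shard) to the source spec, accumulating communication cost, until the target spec is reached, with solved paths cached for reuse. A greatly simplified greedy sketch of that loop, where one_step_transforms stands in for get_all_one_step_transform_spec and specs are treated as opaque, comparable objects (both assumptions for illustration only):

def find_transform_path(source_spec, target_spec, one_step_transforms):
    # one_step_transforms(spec) -> dict mapping a reachable next spec to its step cost.
    # Assumes target_spec is reachable; the real manager also caches solved paths.
    path, total_cost, current = [source_spec], 0.0, source_spec
    while current != target_spec:
        candidates = one_step_transforms(current)
        current, step_cost = min(candidates.items(), key=lambda kv: kv[1])
        total_cost += step_cost
        path.append(current)
    return path, total_cost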
@@ -45,7 +45,7 @@ class _DimSpec:
 
     def _convert_str_to_shard_list(self, str_spec):
         '''
-        Conver str_spec into shard_list.
+        Convert str_spec into shard_list.
 
         Argument:
             str_spec(str): dim spec in str type.
@@ -62,7 +62,7 @@ class _DimSpec:
 
     def build_difference_2d_dict(self):
         '''
-        Build a difference maping for 2D device mesh case. It will be used to
+        Build a difference mapping for 2D device mesh case. It will be used to
         compute the difference between DimSpec pairs.
         '''
 
@@ -166,7 +166,7 @@ class ShardingSpec:
         device_mesh(DeviceMesh): A logical view of a physical mesh.
         entire_shape(torch.Size): The entire shape of tensor before sharded.
         dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
-            and the value of the key decribe which logical axis will be sharded in that dimension.
+            and the value of the key describe which logical axis will be sharded in that dimension.
         sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
     '''
 
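To make the dim_partition_dict / sharding_sequence relationship concrete: each entry maps a tensor dimension to the logical mesh axes it is sharded over, and dimensions not listed are replicated, which is how the straight view like [R, R, S0, S1] arises. A small sketch in string form only (the library builds _DimSpec objects instead):

def sharding_sequence_from_dict(num_dims: int, dim_partition_dict: dict):
    # e.g. a 4-D tensor with {2: [0], 3: [1]} -> ['R', 'R', 'S0', 'S1']
    seq = []
    for dim in range(num_dims):
        axes = dim_partition_dict.get(dim, [])
        seq.append("R" if not axes else "S" + "".join(str(a) for a in axes))
    return seq

assert sharding_sequence_from_dict(4, {2: [0], 3: [1]}) == ["R", "R", "S0", "S1"]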
@@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims):
 
     Argument:
         target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-            and the second element decribes which logical axis will be sharded in that dimension.
+            and the second element describes which logical axis will be sharded in that dimension.
     '''
     _, shard_list = target_pair
     shard_list_list = []