From bce9499ed33f7a8359bbb568c7ee18d72e8aa731 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Thu, 25 Jan 2024 13:56:27 +0800
Subject: [PATCH] fix some typo (#5307)

---
 colossalai/moe/manager.py                               | 2 +-
 colossalai/tensor/comm_spec.py                          | 2 +-
 colossalai/tensor/d_tensor/api.py                       | 4 ++--
 colossalai/tensor/moe_tensor/api.py                     | 2 +-
 colossalai/tensor/moe_tensor/moe_info.py                | 2 +-
 colossalai/utils/timer.py                               | 2 +-
 colossalai/zero/gemini/gemini_optimizer.py              | 8 ++++----
 colossalai/zero/low_level/bookkeeping/bucket_store.py   | 6 +++---
 colossalai/zero/low_level/bookkeeping/gradient_store.py | 2 +-
 colossalai/zero/low_level/low_level_optim.py            | 6 +++---
 docs/source/en/basics/booster_api.md                    | 2 +-
 examples/tutorial/auto_parallel/README.md               | 2 +-
 12 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/colossalai/moe/manager.py b/colossalai/moe/manager.py
index 3e64d796c..eaca75b8f 100644
--- a/colossalai/moe/manager.py
+++ b/colossalai/moe/manager.py
@@ -69,7 +69,7 @@ class MoEManager(metaclass=SingletonMeta):
             fixed_dp_size (int, optional): Fixed dp size in fixed mode. Defaults to 0.
             fixed_ep_size (int, optional): Fixed ep size in fixed mode. Defaults to 0.
             fixed_pp_size (int, optional): Fixed pp size in fixed mode. Defaults to 0.
-            use_ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle. Defaults to True.
+            use_ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False. Defaults to True.
         """
         assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
         assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
diff --git a/colossalai/tensor/comm_spec.py b/colossalai/tensor/comm_spec.py
index de0cba26b..27afac9e9 100644
--- a/colossalai/tensor/comm_spec.py
+++ b/colossalai/tensor/comm_spec.py
@@ -451,7 +451,7 @@ class CommSpec:
         elif self.comm_pattern == CollectiveCommPattern.MIXGATHER_FWD_SPLIT_BWD:
             res_list.append(f"comm_pattern:MIXGATHER_FWD_SPLIT_BWD, ")
             res_list.append(f"gather_dim:{self.gather_dim}, ")
-            res_list.append(f"logical_process_asex:{self.logical_process_axes})")
+            res_list.append(f"logical_process_axes:{self.logical_process_axes})")

         return "".join(res_list)
diff --git a/colossalai/tensor/d_tensor/api.py b/colossalai/tensor/d_tensor/api.py
index 74a785f2d..da6ef275e 100644
--- a/colossalai/tensor/d_tensor/api.py
+++ b/colossalai/tensor/d_tensor/api.py
@@ -96,9 +96,9 @@ def _apply_layout(tensor, layout):
     """
     Apply the layout to the local tensor during initializing process.
     """
-    # layout converter requires a source and target laytout
+    # layout converter requires a source and target layout
     # we construct the source layer for an unsharded tensor
-    # and use self.dist_layer as the targer layout for the sharded tensor
+    # and use self.dist_layer as the target layout for the sharded tensor
     source_spec = _construct_default_sharding_spec(tensor)
     source_layout = Layout(device_mesh=layout.device_mesh, sharding_spec=source_spec, global_shape=tensor.shape)
     sharded_tensor = layout_converter.apply(tensor=tensor, source_layout=source_layout, target_layout=layout)
diff --git a/colossalai/tensor/moe_tensor/api.py b/colossalai/tensor/moe_tensor/api.py
index 1e4486101..b6843df7a 100644
--- a/colossalai/tensor/moe_tensor/api.py
+++ b/colossalai/tensor/moe_tensor/api.py
@@ -40,7 +40,7 @@ def get_moe_info(ep_size: int, dp_size: int, pp_size: int, ep_inside: bool) -> M
         ep_size (int): The expert parallel size.
         dp_size (int): The data parallel size.
         pp_size (int): The pipeline parallel size.
-        ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle.
+        ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False.

    Returns:
        dict: The moe info of the given tensor.
diff --git a/colossalai/tensor/moe_tensor/moe_info.py b/colossalai/tensor/moe_tensor/moe_info.py
index 5097ac104..ba6c77056 100644
--- a/colossalai/tensor/moe_tensor/moe_info.py
+++ b/colossalai/tensor/moe_tensor/moe_info.py
@@ -12,7 +12,7 @@ class MoeParallelInfo:
             ep_size (int): expert parallel size
             dp_size (int): data parallel (zero) size
             pp_size (int, optional): pipeline parallel size. Defaults to 1.
-            ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle. Defaults to True.
+            ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False. Defaults to True.
         """
         self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size
         if ep_inside:
diff --git a/colossalai/utils/timer.py b/colossalai/utils/timer.py
index 8ab6b46f2..2f7ccc24c 100644
--- a/colossalai/utils/timer.py
+++ b/colossalai/utils/timer.py
@@ -123,7 +123,7 @@ class MultiTimer:
         return None

     def get_timer(self, name):
-        """Get timer by its name (from multimer)
+        """Get timer by its name (from multitimer)

        Args:
            name (str): Timer's key.
diff --git a/colossalai/zero/gemini/gemini_optimizer.py b/colossalai/zero/gemini/gemini_optimizer.py
index 8f828bd6c..ad9459339 100644
--- a/colossalai/zero/gemini/gemini_optimizer.py
+++ b/colossalai/zero/gemini/gemini_optimizer.py
@@ -413,7 +413,7 @@ class GeminiOptimizer(OptimizerWrapper):
             only_rank_0(bool): if True, states will be collected only on master rank, otherwise collected on every rank.

         Returns:
-            collected_states(dict): the gathered optimzier state of parameter with given id
+            collected_states(dict): the gathered optimizer state of parameter with given id
             if this method is called by master rank, otherwise an empty dict.

         This method can work only when called by all processes simultaneously.
@@ -461,7 +461,7 @@ class GeminiOptimizer(OptimizerWrapper):
             global_shape = self.optimizer_params_info["id2shape"][param_id]

             # If the chunk is kept gathered,
-            # the parameteres are treated the same as that of those in strict DDP during training.
+            # the parameters are treated the same as that of those in strict DDP during training.
             # So states can be directly fetched from current device.
             if chunk.keep_gathered:
                 assert param_id in self.id_to_fake_params
@@ -644,7 +644,7 @@ class GeminiOptimizer(OptimizerWrapper):
         """
         Args:
             only_rank_0 (bool): a boolean value indicating whether the state_dict is collected
-                                only on rank 0, dafault to True.
+                                only on rank 0, default to True.

         Returns:
             The complete state of the optimizer as a :class:`dict`.
@@ -783,7 +783,7 @@ class GeminiOptimizer(OptimizerWrapper):
             prefix (str, optional): the prefix for states. Default to ''.
             max_shard_size (int, optional): max size of state dict shard (in MB). Defaults to 1024.
             only_rank_0 (bool, optional): a boolean value indicating whether the state_dict is collected
-                                only on rank 0, dafault to True.
+                                only on rank 0, default to True.

         Yields:
             Iterator[OrderedDict]: A generator of state dict shard of optimizer states.
diff --git a/colossalai/zero/low_level/bookkeeping/bucket_store.py b/colossalai/zero/low_level/bookkeeping/bucket_store.py
index 2828d5175..f395fc60e 100644
--- a/colossalai/zero/low_level/bookkeeping/bucket_store.py
+++ b/colossalai/zero/low_level/bookkeeping/bucket_store.py
@@ -15,7 +15,7 @@ class BucketStore(BaseStore):
         # init
         self.current_group_id = 0
         self._num_elements_in_bucket = 0
-        # mapping gardient slices and parameter
+        # mapping gradient slices and parameter
         self.grad_to_param_mapping = dict()

         self._grad_in_bucket = dict()
@@ -59,7 +59,7 @@ class BucketStore(BaseStore):
         self.offset_list[-1] += 1

     def build_grad_in_bucket(self):
-        """Orgnize parameters' gradient(padding and split), follows the paramters' splitting method
+        """Organize parameters' gradient(padding and split), follows the parameters' splitting method

         Data structure of self._grad_in_bucket:
         {
@@ -91,7 +91,7 @@ class BucketStore(BaseStore):
         return self._grad_in_bucket

     def get_flatten_grad(self) -> Tensor:
-        """Return the flattened gradients slices in the bucket, the data orginization of the flattened tensor:
+        """Return the flattened gradients slices in the bucket, the data organization of the flattened tensor:
         [grad0_rank0, grad1_rank0, ..., grad_0_rank1, grad1_rank1, ....]

         Returns:
diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py
index 1164532fa..73a1db5a0 100644
--- a/colossalai/zero/low_level/bookkeeping/gradient_store.py
+++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py
@@ -9,7 +9,7 @@ class GradientStore(BaseStore):
     def __init__(self, *args, partition_grad: bool = False):
         super().__init__(*args)
         """
-        self._grads_of_params mapping the paramater and its gradient slices
+        self._grads_of_params mapping the parameter and its gradient slices
        data structure:
        {
           group_id:{
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index c1b35ee17..891cae65a 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -171,7 +171,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             # managed by this data parallel rank
             param_group["params"] = master_param_current_rank

-        # if there are moe params, store in addtional group in optim
+        # if there are moe params, store in additional group in optim
         if len(moe_params) > 0:
             param_group = dict()
             for key, value in self.optim.param_groups[0].items():
@@ -180,8 +180,8 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
             param_group["params"] = moe_params
             self.optim.param_groups.append(param_group)

-        # intialize communication stream for
-        # communication-compuation overlapping
+        # initialize communication stream for
+        # communication-computation overlapping
         if self._overlap_communication:
             self._comm_stream = device_utils.Stream()
diff --git a/docs/source/en/basics/booster_api.md b/docs/source/en/basics/booster_api.md
index 4d7ffe5a4..2c75dd9ac 100644
--- a/docs/source/en/basics/booster_api.md
+++ b/docs/source/en/basics/booster_api.md
@@ -32,7 +32,7 @@ Plugin is an important component that manages parallel configuration (eg: The ge

 More details about usages of each plugin can be found in chapter [Booster Plugins](./booster_plugins.md).

-Some plugins support lazy initialization, which can be used to save memory when initializating large models. For more details, please see [Lazy Initialization](../features/lazy_init.md).
+Some plugins support lazy initialization, which can be used to save memory when initializing large models. For more details, please see [Lazy Initialization](../features/lazy_init.md).

 ### API of booster

diff --git a/examples/tutorial/auto_parallel/README.md b/examples/tutorial/auto_parallel/README.md
index 135615676..6f11298fc 100644
--- a/examples/tutorial/auto_parallel/README.md
+++ b/examples/tutorial/auto_parallel/README.md
@@ -49,7 +49,7 @@ You should expect to the log like this. This log shows the edge cost on the comp

 ### Auto-Checkpoint Tutorial

-We prepare two bechmarks for you to test the performance of auto checkpoint
+We prepare two benchmarks for you to test the performance of auto checkpoint

 The first test `auto_ckpt_solver_test.py` will show you the ability of solver to search checkpoint strategy that could fit in the given budget (test on GPT2 Medium and ResNet 50). It will output the benchmark summary and data visualization of peak memory vs. budget memory and relative step time vs. peak memory.
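
A note on the `use_ep_inside` / `ep_inside` wording corrected in the docstrings above: "ep inside dp" means the expert-parallel (ep) axis is the inner, fastest-varying dimension of the process grid, while "dp inside ep" makes the data-parallel (dp) axis the inner one. The short Python sketch below only illustrates that ordering; it is not ColossalAI code, and the helper name `moe_process_grid` is hypothetical.

import numpy as np

def moe_process_grid(world_size: int, ep_size: int, dp_size: int, ep_inside: bool = True):
    # Hypothetical illustration, not the library implementation: arrange ranks on a
    # 2D grid whose inner axis is ep when ep_inside is True, and dp otherwise.
    assert ep_size * dp_size == world_size, "the grid must cover every rank exactly once"
    ranks = np.arange(world_size)
    if ep_inside:
        # outer axis = dp, inner axis = ep: each row is one ep group,
        # e.g. world_size=4, ep=2, dp=2 gives ep groups {0, 1} and {2, 3}
        return ranks.reshape(dp_size, ep_size)
    # outer axis = ep, inner axis = dp: each row is one dp group,
    # so the ep groups become the columns, e.g. {0, 2} and {1, 3}
    return ranks.reshape(ep_size, dp_size)

print(moe_process_grid(4, ep_size=2, dp_size=2, ep_inside=True))
print(moe_process_grid(4, ep_size=2, dp_size=2, ep_inside=False))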