From ad6460cf2c4d63ea91dc4dc90431f92567dad303 Mon Sep 17 00:00:00 2001
From: digger-yu
Date: Mon, 15 May 2023 11:46:25 +0800
Subject: [PATCH] [NFC] fix typo applications/ and colossalai/ (#3735)

---
 applications/Chat/examples/community/peft/README.md  | 2 +-
 applications/Chat/inference/README.md                | 2 +-
 applications/Chat/inference/benchmark.py             | 2 +-
 colossalai/auto_parallel/README.md                   | 4 ++--
 .../auto_parallel/passes/runtime_preparation_pass.py | 4 ++--
 colossalai/autochunk/autochunk_codegen.py            | 6 +++---
 colossalai/autochunk/trace_indice.py                 | 2 +-
 colossalai/checkpoint_io/index_file.py               | 2 +-
 colossalai/checkpoint_io/utils.py                    | 2 +-
 colossalai/cli/check/check_installation.py           | 8 ++++----
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/applications/Chat/examples/community/peft/README.md b/applications/Chat/examples/community/peft/README.md
index eabb56fd8..844bfd3d2 100644
--- a/applications/Chat/examples/community/peft/README.md
+++ b/applications/Chat/examples/community/peft/README.md
@@ -18,7 +18,7 @@ For SFT training, just call train_peft_sft.py
 Its arguments are almost identical to train_sft.py instead adding a new eval_dataset if you have a eval_dataset file. The data file is just a plain datafile, please check the format in the easy_dataset.py.
 
 For stage-3 rlhf training, call train_peft_prompts.py.
-Its arguments are almost idential to train_prompts.py. The only difference is that I use text files to indicate the prompt and pretrained data file. The models are included in easy_models.py. Currently only bloom models are tested, but technically gpt2/opt/llama should be supported.
+Its arguments are almost identical to train_prompts.py. The only difference is that I use text files to indicate the prompt and pretrained data file. The models are included in easy_models.py. Currently only bloom models are tested, but technically gpt2/opt/llama should be supported.
 
 # Dataformat
 Please refer the formats in test_sft.txt, test_prompts.txt, test_pretrained.txt.
diff --git a/applications/Chat/inference/README.md b/applications/Chat/inference/README.md
index 434677c98..4848817e0 100644
--- a/applications/Chat/inference/README.md
+++ b/applications/Chat/inference/README.md
@@ -75,7 +75,7 @@ E.g. you can set `export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`.
 
 Please ensure you have downloaded HF-format model weights of LLaMA models first.
 
-Then you can follow [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). This lib provides efficient CUDA kernels and weight convertion script.
+Then you can follow [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). This lib provides efficient CUDA kernels and weight conversion script.
 
 After installing this lib, we may convert the original HF-format LLaMA model weights to 4-bit version.
 
diff --git a/applications/Chat/inference/benchmark.py b/applications/Chat/inference/benchmark.py
index 59cd1eeea..a8485f588 100644
--- a/applications/Chat/inference/benchmark.py
+++ b/applications/Chat/inference/benchmark.py
@@ -123,7 +123,7 @@ if __name__ == "__main__":
     start = time()
     for instruction in instructions:
         print(f"Instruction: {instruction}")
-        resp, tokens = evaluate(model, tokenizer, instruction, temparature=0.2, num_beams=1)
+        resp, tokens = evaluate(model, tokenizer, instruction, temperature=0.2, num_beams=1)
         total_tokens += tokens
         print(f"Response: {resp}")
         print('\n----------------------------\n')
diff --git a/colossalai/auto_parallel/README.md b/colossalai/auto_parallel/README.md
index 8e47e1bb0..f011ec8cc 100644
--- a/colossalai/auto_parallel/README.md
+++ b/colossalai/auto_parallel/README.md
@@ -16,8 +16,8 @@ A *symbolic profiler* for collecting computing and memory overhead related to st
 
 ### Solver
 **Solver** is designed to find the optimal execution plan for a given computation graph and cluster in two stages:
-1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimaztion goal of intra-op parallelism solver is modified from Alpa 's intra-op parallelsim ILP solver.
-2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find optimial activation checkpoint is modified from Rotor . The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. On the contrary, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of intra-op parallelism solver. In addition, a more optimal solution can be found by adding the communication overhead into the activation checkpoint modeling.
+1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimization goal of intra-op parallelism solver is modified from Alpa 's intra-op parallelism ILP solver.
+2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find optimal activation checkpoint is modified from Rotor . The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. On the contrary, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of intra-op parallelism solver. In addition, a more optimal solution can be found by adding the communication overhead into the activation checkpoint modeling.
 ### Generator
 **Generator** applies the searched execution plan to the computation graph and recompiles the computation graph to optimized PyTorch code.
 It has *a series compile pass* to insert a communication node or do the kernel substitution as the intra-op parallelism solver required. Additionally, we implement a *code generation* feature to recognize the annotation from the activation checkpoint solver and inject the activation checkpoint block following annotation instructions.
diff --git a/colossalai/auto_parallel/passes/runtime_preparation_pass.py b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
index 08af846b2..177f3765f 100644
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
@@ -169,7 +169,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh
         This function is used to process the dependency between the size node and its users after inserting the size_process_node.
         '''
 
-        # store original node and processing node pair in node_pairs dictioanry
+        # store original node and processing node pair in node_pairs dictionary
         # It will be used to replace the original node with processing node in slice object
         node_pairs[node] = size_processing_node
         size_processing_node._meta_data = node._meta_data
@@ -388,7 +388,7 @@ def module_params_sharding_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMes
     """
     mod_graph = gm.graph
     nodes = tuple(mod_graph.nodes)
-    # This stream is created for overlaping the communication and computation.
+    # This stream is created for overlapping the communication and computation.
     reduction_stream = torch.cuda.Stream()
 
     def _add_hook_for_grad_communication(node, param, name=None):
diff --git a/colossalai/autochunk/autochunk_codegen.py b/colossalai/autochunk/autochunk_codegen.py
index d0a467254..cc98c1570 100644
--- a/colossalai/autochunk/autochunk_codegen.py
+++ b/colossalai/autochunk/autochunk_codegen.py
@@ -40,7 +40,7 @@ def _gen_chunk_slice_dim(chunk_dim: int, chunk_indice_name: str, shape: List) ->
     return new_shape
 
 
-def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_ouput_dim: int, chunk_size=2) -> str:
+def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_output_dim: int, chunk_size=2) -> str:
     """
     Generate chunk loop start
 
@@ -52,7 +52,7 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_oup
     Args:
         chunk_input (List[Node]): chunk input node
         chunk_output (Node): chunk output node
-        chunk_ouput_dim (int): chunk output node chunk dim
+        chunk_output_dim (int): chunk output node chunk dim
         chunk_size (int): chunk size. Defaults to 2.
 
     Returns:
@@ -74,7 +74,7 @@ def _gen_loop_start(chunk_input: List[Node], chunk_output: List[Node], chunk_oup
                                            input_node.name, input_node.name)
 
     out_shape = get_node_shape(chunk_output[0])
-    chunk_shape = out_shape[chunk_ouput_dim[0]]
+    chunk_shape = out_shape[chunk_output_dim[0]]
     context += "chunk_size = %d\nfor chunk_idx in range(0, %d, chunk_size):\n" % (chunk_size, chunk_shape)
 
     return context
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index c7fce4c8b..d56bf843f 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -18,7 +18,7 @@ class TraceIndice(object):
         dim(x1)=dim(x2)=dim(x3)=[a, b, c]
     This class will record every node's dims' indice, compute and source.
 
-    Attibutes:
+    Attributes:
         node_list (List)
         indice_trace_list (List): [{"indice": [...], "compute": [...], "source": [...]}, {...}]
         indice_view_list (Dict): not used for now
diff --git a/colossalai/checkpoint_io/index_file.py b/colossalai/checkpoint_io/index_file.py
index 15a6d09f3..334ecbc04 100644
--- a/colossalai/checkpoint_io/index_file.py
+++ b/colossalai/checkpoint_io/index_file.py
@@ -159,7 +159,7 @@ class CheckpointIndexFile:
 
     def write_index_file(self, save_index_file):
         """
-        Wriete index file.
+        Write index file.
         """
         save_index_file = os.path.join(self.root_path, save_index_file)
         index = {"metadata": self.metadata, "weight_map": self.weight_map}
diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py
index 16e41631f..ee4bd72e8 100644
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -21,7 +21,7 @@ def calculate_tensor_size(tensor: torch.Tensor) -> float:
     If so, a new shard should be created.
 
     Args:
-        tenosr (torch.Tensor): the tensor to calculate size for.
+        tensor (torch.Tensor): the tensor to calculate size for.
 
     Returns:
         float: size of the tensor in MB.
diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py
index cb3dbbc09..4a481f3bd 100644
--- a/colossalai/cli/check/check_installation.py
+++ b/colossalai/cli/check/check_installation.py
@@ -31,7 +31,7 @@ def check_installation():
     found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
     cuda_version = _check_cuda_version()
     torch_version, torch_cuda_version = _check_torch_version()
-    colossalai_verison, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version()
+    colossalai_version, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version()
 
     # if cuda_version is None, that means either
     # CUDA_HOME is not found, thus cannot compare the version compatibility
@@ -57,7 +57,7 @@ def check_installation():
     click.echo(f'#### Installation Report ####')
 
     click.echo(f'\n------------ Environment ------------')
-    click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
+    click.echo(f"Colossal-AI version: {to_click_output(colossalai_version)}")
     click.echo(f"PyTorch version: {to_click_output(torch_version)}")
     click.echo(f"System CUDA version: {to_click_output(cuda_version)}")
     click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
@@ -137,7 +137,7 @@ def _parse_colossalai_version():
     # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions)
     # 2. X.X.X (when colossalai is not installed with CUDA extensions)
     # where X represents an integer.
-    colossalai_verison = colossalai.__version__.split('+')[0]
+    colossalai_version = colossalai.__version__.split('+')[0]
 
     try:
         torch_version_for_aot_build = colossalai.__version__.split('torch')[1].split('cu')[0]
@@ -145,7 +145,7 @@ def _parse_colossalai_version():
     except:
         torch_version_for_aot_build = None
         cuda_version_for_aot_build = None
-    return colossalai_verison, torch_version_for_aot_build, cuda_version_for_aot_build
+    return colossalai_version, torch_version_for_aot_build, cuda_version_for_aot_build
 
 
 def _check_aot_built_cuda_extension_installed():