[nfc]fix typo colossalai/pipeline tensor nn (#3899)

* fix typo colossalai/autochunk auto_parallel amp

* fix typo colossalai/auto_parallel nn utils etc.

* fix typo colossalai/auto_parallel autochunk fx/passes  etc.

* fix typo docs/

* change placememt_policy to placement_policy in docs/ and examples/

* fix typo colossalai/ applications/

* fix typo colossalai/cli fx kernel

* fix typo colossalai/nn

* revert change warmuped

* fix typo colossalai/pipeline tensor nn
pull/3908/head
digger yu 2023-06-06 14:07:36 +08:00 committed by GitHub
parent c1535ccbba
commit 0e484e6201
13 changed files with 27 additions and 27 deletions


@@ -13,7 +13,7 @@ from .nvme_optimizer import NVMeOptimizer
class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.


@@ -14,7 +14,7 @@ from .cpu_adam import CPUAdam
class HybridAdam(CPUAdam):
"""Implements Adam algorithm.
Supports parameters updating on both GPU and CPU, depanding on the device of parameters.
Supports parameters updating on both GPU and CPU, depending on the device of parameters.
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
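
For context, CPUAdam and HybridAdam document the same placement rule: a parameter and its gradient must sit on the same device. A minimal usage sketch under that rule, assuming ColossalAI's CPU Adam kernel is built in the environment and that both optimizers follow the torch.optim.Adam argument convention:

import torch
from colossalai.nn.optimizer import CPUAdam, HybridAdam

# a model kept entirely on CPU, so parameters and the gradients
# produced by backward() share the same device
model = torch.nn.Linear(16, 4)
optimizer = CPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 16)).sum()
loss.backward()
optimizer.step()    # CPU parameters + CPU gradients: allowed

# moving the whole model to GPU keeps parameters and gradients together,
# which is equally allowed; HybridAdam is used the same way:
#     model = model.cuda()
#     optimizer = HybridAdam(model.parameters(), lr=1e-3)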


@@ -83,7 +83,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
for k, v in kwargs.items():
if isinstance(v, torch.nn.Module):
v = self._layer_spec_dict[id(v)]
# (lyl)TODO: analyse ColoTensor as well
# (lyl)TODO: analyze ColoTensor as well
modified_kwargs[k] = v
# keep track of the module children
@@ -117,7 +117,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
def to_layer_list(self, exec_seq=None):
"""
Create a layer spec list and func list with execution sequence given by user.
If exec_seq is None, we will take the module initizing order as execution order.
If exec_seq is None, we will take the module initializing order as execution order.
"""
self._exec_seq = exec_seq
@@ -177,7 +177,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
def partition(self, num_chunks, pipeline_size, rank):
"""
Partitioned model will be built respect to partion policy.
Partitioned model will be built respect to partition policy.
The real module instance will be built in this method.
"""
if isinstance(self._policy, str):
@@ -193,7 +193,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
self.customized_parts = customized_partition(self._exec_seq)
assert len(self.customized_parts) == gpc.get_world_size(
ParallelMode.PIPELINE
), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partions is {len(self.customized_parts)}'
), f'World size is {gpc.get_world_size(ParallelMode.PIPELINE)}, but the number of partitions is {len(self.customized_parts)}'
parts = self.customized_parts[rank]
else:
raise ValueError("A string partition policy should be one of ['uniform', 'balanced', 'customized'].")
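
Taken together, the docstrings above describe the intended workflow: record layer specs inside the context, call to_layer_list (optionally with an explicit exec_seq), then build the real module for one pipeline rank under a partition policy. A rough sketch of that sequence, assuming a launched ColossalAI distributed context and using illustrative values for num_chunks, pipeline_size and rank:

import colossalai
import torch.nn as nn
from colossalai.pipeline.pipelinable import PipelinableContext

colossalai.launch_from_torch(config={})    # assumes a torchrun-launched process group

pipelinable = PipelinableContext()
with pipelinable:
    # layers created inside the context are recorded as layer specs
    # rather than materialized immediately
    model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 10))

# exec_seq=None, so execution order falls back to the module-initializing order
pipelinable.to_layer_list()
pipelinable.policy = "uniform"

# build the real module instance owned by this pipeline stage
stage_module = pipelinable.partition(num_chunks=1, pipeline_size=2, rank=0)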


@@ -123,7 +123,7 @@ class WorkerBase(ABC):
self.device = device
self._initialize_outstanding_range()
# variable and const for context managment
# variable and const for context management
self.outstanding = 0
self.forward_times = 0
self.backward_times = 0
@@ -226,7 +226,7 @@ class WorkerBase(ABC):
self.pp_rank_to_worker_rref = pp_rank_to_worker_rref
# for some schedule need the other worker's info to initialise partition (like Chimera)
# construction of partition is executed after the registion of pp_rank_to_worker_rref
# construction of partition is executed after the registration of pp_rank_to_worker_rref
self._initialize_partition()
# res_use works for lifecycle counter,
@@ -418,7 +418,7 @@ class WorkerBase(ABC):
# On current PP middleware design for DAG, get_output_by_key used by _subscribe_producer
# can only be executed once for every producer-consumer stage pair, which is necessary
# to count the lifecycle of work_item. So, keeping the _subscribe_producer in the same
# lock of work_item queue operation gurantees the consistency of lifecycle counter.
# lock of work_item queue operation guarantees the consistency of lifecycle counter.
work_item_from_producer = self._subscribe_producer(microbatch_id, forward_only)
self.work_list[key] = work_item_from_producer
self.work_list_condition_lock.notify_all()
@@ -460,7 +460,7 @@ class WorkerBase(ABC):
# On current PP middleware design for DAG, get_output_by_key used by subscribe_consumer
# can only be executed once for every producer-consumer stage pair, which is necessary
# to count the lifecycle of work_item. So, keeping the subscribe_consumer in the same
# lock of work_item queue operation gurantees the consistency of lifecycle counter.
# lock of work_item queue operation guarantees the consistency of lifecycle counter.
work_item_from_consumer = self._subscribe_consumer(microbatch_id)
self.work_list[key] = work_item_from_consumer
self.work_list_condition_lock.notify_all()
@@ -508,7 +508,7 @@ class WorkerBase(ABC):
assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
# should be aranged in order, the order of the input of current forward
# should be arranged in order, the order of the input of current forward
self.producer_stage_ids = self.get_producer_stage_ids()
self.consumer_stage_ids = self.get_consumer_stage_ids()
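
The two comments fixed above make the same point: subscribing to a producer/consumer and registering the resulting work item must happen under one work-list condition lock, otherwise a stage pair could be subscribed twice and the lifecycle counter would drift. A simplified, hypothetical sketch of that pattern (not the middleware's actual code; the names only mirror the snippet above):

import threading

work_list = {}
work_list_condition_lock = threading.Condition()

def receive_from_producer(key, microbatch_id, subscribe_producer):
    # subscribe and publish atomically: each producer-consumer pair is
    # subscribed exactly once, keeping the lifecycle counter consistent
    with work_list_condition_lock:
        if key not in work_list:
            work_list[key] = subscribe_producer(microbatch_id)
            work_list_condition_lock.notify_all()
        return work_list[key]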


@@ -123,7 +123,7 @@ class ChimeraWorker(WorkerBase):
assert self.producer_stage_ids is None, f"all the producers of rank {rank} has been subscribed"
assert self.consumer_stage_ids is None, f"all the consumers of rank {rank} has been subscribed"
# should be aranged in order, the order of the input of current forward
# should be arranged in order, the order of the input of current forward
self.producer_stage_ids = []
self.consumer_stage_ids = []
@@ -174,7 +174,7 @@ class ChimeraWorker(WorkerBase):
else:
# if it is down pipeline, create partition by origin method
co_up_pp_worker_rref = self.pp_rank_to_worker_rref[pp_rank - stage_num]
# get the coresponding model state dict and wait for its init
# get the corresponding model state dict and wait for its init
state_dict = co_up_pp_worker_rref.rpc_sync().get_partition_state_dict()
super()._initialize_partition()
self.module_partition.load_state_dict(state_dict)
@@ -228,7 +228,7 @@ class ChimeraWorker(WorkerBase):
stage_num = self.actual_stage_num
co_pp_rank = (pp_rank + stage_num) % (2 * stage_num)
# if currrent pp_rank is not the first to do step
# if current pp_rank is not the first to do step
# wait its previous pp_rank finish step
grads = self.get_parameter_gradients()


@@ -113,7 +113,7 @@ def _binary_search(weights, num):
def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
assert num_items % num_chunks == 0, \
"Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
"Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
logger = get_dist_logger()
parts = [[] for _ in range(pipeline_parallel_size)]
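
The assertion above is the whole contract: the layer count must divide evenly into chunks, and each chunk is then split into contiguous ranges across the pipeline ranks. A self-contained illustration of that idea (not the library function itself; the exact remainder handling is an assumption):

def uniform_parts(num_items, pipeline_parallel_size, num_chunks):
    assert num_items % num_chunks == 0, \
        "Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
    parts = [[] for _ in range(pipeline_parallel_size)]
    chunk_size = num_items // num_chunks
    for chunk in range(num_chunks):
        start = chunk * chunk_size
        base, remainder = divmod(chunk_size, pipeline_parallel_size)
        for rank in range(pipeline_parallel_size):
            end = start + base + (1 if rank < remainder else 0)
            parts[rank].append((start, end))    # [start, end) range of layer indices
            start = end
    return parts

# 8 layers, 2 pipeline stages, 2 chunks:
# stage 0 owns [(0, 2), (4, 6)], stage 1 owns [(2, 4), (6, 8)]
print(uniform_parts(8, 2, 2))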


@@ -28,7 +28,7 @@ class CommSpec:
to determine the buffer shape, and logical_process_axis
Argument:
comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
process_groups_dict(Dict): A dict which contains the process groups used to apply this CommSpec.
gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
shard_dim(int, Optional): The shard_dim of the tensor will be sharded.


@@ -41,7 +41,7 @@ class DimSpec:
def _convert_str_to_shard_list(self, str_spec):
'''
Conver str_spec into shard_list.
Convert str_spec into shard_list.
Argument:
str_spec(str): dim spec in str type.
@@ -58,7 +58,7 @@ class DimSpec:
def build_difference_2d_dict(self):
'''
Build a difference maping for 2D device mesh case. It will be used to
Build a difference mapping for 2D device mesh case. It will be used to
compute the difference between DimSpec pairs.
'''
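
Both fixed docstrings concern the dim-spec string notation, where 'R' marks a replicated dimension and 'S0', 'S1', 'S01' mark sharding over the listed logical mesh axes. A standalone illustration of the string-to-shard-list conversion they describe (hypothetical helper, not the class method):

def convert_str_to_shard_list(str_spec):
    # 'R' -> replicated, no sharding axes; 'S01' -> sharded over mesh axes 0 and 1
    if str_spec == 'R':
        return []
    assert str_spec.startswith('S'), f"invalid dim spec {str_spec}"
    return [int(axis) for axis in str_spec[1:]]

print(convert_str_to_shard_list('R'))      # []
print(convert_str_to_shard_list('S0'))     # [0]
print(convert_str_to_shard_list('S01'))    # [0, 1]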


@@ -164,7 +164,7 @@ def _get_grad_args(*args):
for obj in args:
if _is_grad_tensor(obj):
return args, None
# otherwise, the first arguement should be a tuple of grad tensors
# otherwise, the first argument should be a tuple of grad tensors
# if there is no grad tensor, the backward of PreFwdPostBwd can't be triggered
arg_zero = args[0]
if not isinstance(arg_zero, tuple):


@@ -130,7 +130,7 @@ class ProcessGroup:
@property
def has_cpu_groups(self) -> bool:
"""has_cpu_groups
If cpu groups have been initailized.
If cpu groups have been initialized.
Returns:
bool: cpu process groups have been initialized or not.


@@ -252,7 +252,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
def get_all_shard_spec(self, source_spec: ShardingSpec, orig_cost_dict):
'''
Get all valid sharding specs from source_spec with single shard operation, and
accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
accumulate communication cost on origin cost which will finally be used in auto sharding solver.
For the sharding operation, we just care about legal sharding dimensions.
Argument:
@@ -386,7 +386,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
def get_all_one_step_transform_spec(self, source_spec: ShardingSpec, orig_cost_dict) -> Dict[ShardingSpec, float]:
'''
Get all valid sharding specs from source_spec with one step transform, and
accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
accumulate communication cost on origin cost which will finally be used in auto sharding solver.
Note:
all-gather will eliminate a sharding dimension, all-to-all will keep sharding dimension same as before,
and shard will add a sharding dimension. Therefore, the result of above operations are mutual exclusive,
@@ -577,7 +577,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
Step3:
Repeat above steps until the source spec transform to target spec.
During finding the transform path, commucation cost will be accumulated, and it
During finding the transform path, communication cost will be accumulated, and it
will be finally used in auto parallel solver.
Additionally, to avoid repeating the path search in runtime, we cached all solved path
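
The docstrings above describe a search that repeatedly applies one-step transforms (all-gather, all-to-all, shard) to the source spec and accumulates each step's communication cost until the target spec is reached. A toy illustration of that accumulation; the specs, operations and costs below are made up for the example and are not produced by the manager:

transform_path = [
    ("[S0, R]", "all-gather dim 0",           "[R, R]",  2.0e-3),
    ("[R, R]",  "shard dim 1 on mesh axis 0", "[R, S0]", 1.0e-5),
]

accumulated_cost = 0.0
for source, comm_op, result, step_cost in transform_path:
    accumulated_cost += step_cost    # cost is accumulated onto the original cost
    print(f"{source} --{comm_op}--> {result}  (running cost {accumulated_cost:.1e})")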


@@ -45,7 +45,7 @@ class _DimSpec:
def _convert_str_to_shard_list(self, str_spec):
'''
Conver str_spec into shard_list.
Convert str_spec into shard_list.
Argument:
str_spec(str): dim spec in str type.
@@ -62,7 +62,7 @@ class _DimSpec:
def build_difference_2d_dict(self):
'''
Build a difference maping for 2D device mesh case. It will be used to
Build a difference mapping for 2D device mesh case. It will be used to
compute the difference between DimSpec pairs.
'''
@@ -166,7 +166,7 @@ class ShardingSpec:
device_mesh(DeviceMesh): A logical view of a physical mesh.
entire_shape(torch.Size): The entire shape of tensor before sharded.
dim_partition_dict(Dict[int, List[int]] optional): The key is the dimension of tensor to be sharded,
and the value of the key decribe which logical axis will be sharded in that dimension.
and the value of the key describe which logical axis will be sharded in that dimension.
sharding_sequence(List[_DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
'''
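
In this notation, dim_partition_dict maps a tensor dimension to the logical mesh axes it is sharded over, and the sharding_sequence is the per-dimension view derived from it. A standalone illustration of that relationship (hypothetical helper, not the ShardingSpec constructor):

def build_sharding_sequence(num_dims, dim_partition_dict):
    # every dimension defaults to replicated ('R'); a dimension listed in
    # dim_partition_dict becomes 'S' followed by its logical mesh axes
    sequence = ['R'] * num_dims
    for dim, mesh_axes in dim_partition_dict.items():
        sequence[dim] = 'S' + ''.join(str(axis) for axis in mesh_axes)
    return sequence

# a 3-D tensor with dim 0 sharded over mesh axis 0 and dim 2 over axes 0 and 1
print(build_sharding_sequence(3, {0: [0], 2: [0, 1]}))    # ['S0', 'R', 'S01']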


@@ -77,7 +77,7 @@ def shard_simulator(target_pair, legal_sharding_dims):
Argument:
target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
and the second element decribes which logical axis will be sharded in that dimension.
and the second element describes which logical axis will be sharded in that dimension.
'''
_, shard_list = target_pair
shard_list_list = []