mirror of https://github.com/hpcaitech/ColossalAI
fix typo colossalai/auto_parallel autochunk fx/passes etc. (#3808)
parent 725365f297
commit 7f8203af69

@@ -14,7 +14,7 @@
 - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
 - [Release](#release)
 - [User Friendliness](#user-friendliness)
-- [Commmunity](#commmunity)
+- [Community](#community)
 - [Configuration](#configuration)
 - [Progress Log](#progress-log)

@@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll
 | `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
 | `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. |

-### Commmunity
+### Community

 | Workflow Name | File name | Description |
 | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |

@@ -148,7 +148,7 @@ class MetaInfoProp:
 graph_info.fwd_tmp = buffer_tensors
 graph_info.fwd_out = output_tensors

-# fetch other memory informations
+# fetch other memory information
 memory_cost = meta_info.memory_cost
 graph_info.fwd_mem_tmp = memory_cost.fwd.temp
 graph_info.fwd_mem_out = memory_cost.fwd.activation

@@ -44,7 +44,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
 '''
 Compute the computation cost per device with this specific strategy.

-Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 '''
 # TODO: a constant coefficient need to be added.
 # 1D: (L) * N * Cin

@@ -38,9 +38,9 @@ class ConvStrategyGenerator(StrategyGenerator):
 '''
 Compute the computation cost per device with this specific strategy.

-Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 '''
-# TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+# TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 # 1D: (L) * N * Cout * Cin * kernel
 # 2D: (H * W) * N * Cout * Cin * kernel
 # 3D: (H * W * D) * N * Cout * Cin * kernel

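The `# 1D/2D/3D` comments above record the raw multiply-accumulate count of a convolution on the per-device (sharded) shape, before any division by device TFLOPS. A minimal, self-contained sketch of that bookkeeping for the 2D case, using made-up shapes rather than the generator's real sharding specs:

    # Per-device "computation size" of a 2D convolution, following the
    # (H * W) * N * Cout * Cin * kernel formula noted in the diff above.
    def conv2d_compute_size(n, c_in, c_out, h_out, w_out, kernel_h, kernel_w):
        # Raw MAC count; dividing by device TFLOPS would turn it into a time estimate.
        return h_out * w_out * n * c_out * c_in * kernel_h * kernel_w

    # Example: global batch 8 sharded over 4 devices (2 samples per device),
    # 64 -> 128 channels, 56x56 output, 3x3 kernel.
    per_device = conv2d_compute_size(n=2, c_in=64, c_out=128,
                                     h_out=56, w_out=56, kernel_h=3, kernel_w=3)
    print(f"per-device compute size: {per_device:.3e} MACs")
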
@@ -34,9 +34,9 @@ class LayerNormGenerator(StrategyGenerator):
 '''
 Compute the computation cost per device with this specific strategy.

-Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 '''
-# TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+# TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 # TODO: a constant coefficient need to be added.

 sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()

@@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
 """
 NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd.
 The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image,
-and reduce them depening on the operation type.
+and reduce them depending on the operation type.
 """

 def validate(self) -> bool:

@@ -35,9 +35,9 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
 '''
 Compute the computation cost per device with this specific strategy.

-Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 '''
-# TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+# TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
 # 1D: (Lout) * N * C * kernel
 # 2D: (H * W) * N * Cout * Cin * kernel
 # 3D: (H * W * D) * N * Cout * Cin * kernel

@@ -366,8 +366,8 @@ class TraceFlow(object):
 # find non chunk inputs
 chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)

-# reassgin reshape size, some size may have changed due to chunk
-chunk_info = self._reassgin_reshape_size(chunk_info)
+# reassign reshape size, some size may have changed due to chunk
+chunk_info = self._reassign_reshape_size(chunk_info)

 return chunk_info

@@ -428,10 +428,10 @@ class TraceFlow(object):
 chunk_info["outputs_dim"].append(output_dim)
 return True

-def _reassgin_reshape_size(self, chunk_info):
+def _reassign_reshape_size(self, chunk_info):
 """
 Some shape args in reshape may have changed due to chunk
-reassgin those changed shape
+reassign those changed shape
 """
 chunk_region = chunk_info["region"]
 reshape_size = {}

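`_reassign_reshape_size` exists because literal size arguments in reshape calls no longer match the tensor once one dimension is chunked. A toy illustration of the mismatch it fixes (the shapes and chunk size here are invented, not taken from autochunk):

    import torch

    x = torch.randn(8, 16, 32)
    full = x.reshape(8, 16 * 32)        # original code reshapes the full batch of 8

    # If the first dimension is chunked into pieces of size 2, the literal "8"
    # in the reshape has to be reassigned to the chunk size "2".
    chunk = x[0:2]
    chunked = chunk.reshape(2, 16 * 32)
    assert chunked.shape == (2, 512)
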
@@ -397,7 +397,7 @@ class TraceIndice(object):
 input_node = node.args[0]
 assert len(get_node_shape(input_node)) == 4

-# assgin index
+# assign index
 self._assign_indice_as_input(node, node_idx, input_node)
 self._del_dim(node_idx, 1)
 self._add_dim(node_idx, 1)

@@ -415,7 +415,7 @@ class TraceIndice(object):
 assert node.kwargs['size'] is None
 assert len(get_node_shape(node)) == 4

-# assgin index
+# assign index
 self._assign_indice_as_input(node, node_idx)
 self._mark_computation(node, node_idx, [-1, -2])

@@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase):
 Users can provide this argument to speed up searching.
 If users do not know this argument before training, it is ok. We will use a default value 1024.
 min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-If the aggregate size of parameters is still samller than the minimum chunk size,
+If the aggregate size of parameters is still smaller than the minimum chunk size,
 all parameters will be compacted into one small chunk.
 memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
 gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)

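For orientation only, a hedged sketch of how the plugin documented above is typically constructed through the booster API; apart from `min_chunk_size_mb` and `gpu_margin_mem_ratio`, which appear in the docstring, the launch call, default values, and return signature below are assumptions about this ColossalAI version, not facts taken from the diff:

    # Run under torchrun; a sketch, not a verified recipe for this exact commit.
    import torch
    import torch.nn as nn
    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import GeminiPlugin

    colossalai.launch_from_torch(config={})
    model = nn.Linear(1024, 1024)
    optimizer = torch.optim.Adam(model.parameters())

    plugin = GeminiPlugin(
        min_chunk_size_mb=32,        # never search for chunks smaller than 32 MB
        gpu_margin_mem_ratio=0.0,    # 0.0: do not hand the leftover GPU margin to the optimizer
    )
    booster = Booster(plugin=plugin)
    model, optimizer, *_ = booster.boost(model, optimizer)
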
@@ -181,7 +181,7 @@ class DistCoordinator(metaclass=SingletonMeta):
 """
 is_master = self.is_master(process_group)

-# define an inner functiuon
+# define an inner function
 def decorator(func):

 @functools.wraps(func)

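The surrounding code builds a decorator that lets a function run only on the master rank. A generic, framework-free sketch of that pattern (an analogue, not the actual DistCoordinator implementation; `is_master` is just a boolean here):

    import functools

    def on_master_only(is_master: bool):
        # Return a decorator that turns func into a no-op on non-master ranks.
        def decorator(func):

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if is_master:
                    return func(*args, **kwargs)
                return None

            return wrapper

        return decorator

    @on_master_only(is_master=True)      # in practice: rank 0 only
    def report(msg):
        print(msg)

    report("saving checkpoint")          # printed only where is_master is True
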
@@ -381,7 +381,7 @@ class AlphaBetaProfiler:
 first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
 second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
 mesh_alpha = [first_latency, second_latency]
-# The beta values have been enlarged by 1e10 times temporarilly because the computation cost
+# The beta values have been enlarged by 1e10 times temporarily because the computation cost
 # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future.
 mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]

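The alpha-beta model used here estimates a transfer as latency plus message size over bandwidth; the 1e10 factor in `mesh_beta` is the temporary rescaling the comment describes. A small numeric sketch (the latency and bandwidth figures are illustrative, not profiled values):

    # Classic alpha-beta estimate: time = alpha + size / bandwidth.
    def comm_time(alpha_s, bandwidth_bytes_per_s, message_bytes):
        return alpha_s + message_bytes / bandwidth_bytes_per_s

    alpha = 5e-6           # 5 us of latency (example)
    bandwidth = 100e9      # 100 GB/s link (example)
    print(comm_time(alpha, bandwidth, 64 * 1024 * 1024))   # ~6.8e-4 s for a 64 MB message

    # The profiler stores beta = 1e10 / bandwidth so that communication cost stays on a
    # scale comparable to compute cost, which is still counted in TFLOPs rather than seconds.
    mesh_beta = 1e10 / bandwidth
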
@@ -152,9 +152,9 @@ class PipelineSchedule(BaseSchedule):
 raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")

 def load_micro_batch(self):
-mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
+micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
 self.microbatch_offset += self.microbatch_size
-return self._move_to_device(mciro_batch_data)
+return self._move_to_device(micro_batch_data)

 def pre_processing(self, engine):
 from colossalai.zero.legacy import ShardedModelV2

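`load_micro_batch` walks an offset through the already-loaded batch, handing out one micro-batch per call. The slicing logic, reduced to a plain tensor (this standalone loop is only an illustration of the same idea):

    import torch

    batch = torch.arange(8).reshape(8, 1)    # global batch of 8 samples
    microbatch_size = 2
    offset = 0

    micro_batches = []
    while offset < batch.size(0):
        micro_batches.append(batch[offset:offset + microbatch_size])  # data slice
        offset += microbatch_size                                     # advance offset

    assert len(micro_batches) == 4    # 8 samples / micro-batch size of 2
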
@@ -84,7 +84,7 @@ class PipelineScheduleV2(PipelineSchedule):
 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
 self.load_batch(data_iter)

-# num_warmup_microbatches is the step when not all the processers are working
+# num_warmup_microbatches is the step when not all the processes are working
 num_warmup_microbatches = \
 (gpc.get_world_size(ParallelMode.PIPELINE)
 - gpc.get_local_rank(ParallelMode.PIPELINE) - 1)

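The expression above gives each pipeline stage `world_size - rank - 1` warmup forward passes before the schedule settles into steady one-forward-one-backward execution. A quick check of that arithmetic:

    # For a 4-stage pipeline, earlier stages run more warmup forwards so that
    # downstream stages have work queued before the first backward arrives.
    pipeline_world_size = 4
    for rank in range(pipeline_world_size):
        num_warmup_microbatches = pipeline_world_size - rank - 1
        print(f"stage {rank}: {num_warmup_microbatches} warmup micro-batches")
    # stage 0: 3, stage 1: 2, stage 2: 1, stage 3: 0
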
@@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func,
 # append code text to body
 for idx, node in enumerate(node_list):
 # if this is the first node of the ckpt region
-# append the ckpt function defition
+# append the ckpt function definition
 if idx in start_idx:
 label = start_idx.index(idx)
 ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])

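The emitted `ckpt_fn_def` turns each checkpoint region into its own function so the region can be re-run during backward instead of keeping its activations. A hand-written analogue of the generated code's effect, using the stock torch checkpoint API rather than ColossalAI's emitted wrapper (the region body here is invented):

    import torch
    from torch.utils.checkpoint import checkpoint

    linear = torch.nn.Linear(16, 16)

    def checkpoint_0(x):                  # stands in for one emitted region function
        return torch.relu(linear(x))

    x = torch.randn(4, 16, requires_grad=True)
    out = checkpoint(checkpoint_0, x)     # activations recomputed during backward
    out.sum().backward()
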
@@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):

 def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
 """
-In avgnode_split_pass, simpliy split graph by node number.
+In avgnode_split_pass, simply split graph by node number.
 """
 mod_graph = gm.graph
 avg_num_node = len(mod_graph.nodes) // pp_size

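Splitting "by node number" means each pipeline stage receives roughly `len(nodes) // pp_size` consecutive graph nodes. A list-based sketch of that partitioning, with strings standing in for fx nodes:

    def split_by_node_count(nodes, pp_size):
        # Equal shares for the first pp_size - 1 stages; the last stage takes the remainder.
        avg = len(nodes) // pp_size
        stages = [nodes[i * avg:(i + 1) * avg] for i in range(pp_size - 1)]
        stages.append(nodes[(pp_size - 1) * avg:])
        return stages

    nodes = [f"node_{i}" for i in range(10)]
    print(split_by_node_count(nodes, pp_size=3))
    # [['node_0', 'node_1', 'node_2'], ['node_3', 'node_4', 'node_5'],
    #  ['node_6', 'node_7', 'node_8', 'node_9']]
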
@@ -16,7 +16,7 @@ def apply(*args, **kwargs):
 return shape_consistency_manager.apply(*args, **kwargs)


-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
+def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
 mod_graph = gm.graph
 nodes = tuple(mod_graph.nodes)

@@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple):
 numel: int
 is_tensor: bool
 # TODO: we can add a list of sharding spec here, and record the sharding
-# behaviour by appending sharding spec into list.
+# behavior by appending sharding spec into list.


 def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata:

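`_extract_tensor_metadata` reads facts such as `numel` and `is_tensor` off a tensor and packs them into the NamedTuple. A self-contained sketch of that idea with a reduced field set (the real TensorMetadata carries more fields than shown here):

    from typing import NamedTuple, Tuple
    import torch

    class TinyTensorMetadata(NamedTuple):      # illustrative subset, not the real class
        shape: Tuple[int, ...]
        dtype: torch.dtype
        numel: int
        is_tensor: bool

    def extract(result) -> TinyTensorMetadata:
        if isinstance(result, torch.Tensor):
            return TinyTensorMetadata(tuple(result.shape), result.dtype, result.numel(), True)
        return TinyTensorMetadata((), torch.float32, 0, False)

    print(extract(torch.zeros(2, 3)))   # shape=(2, 3), dtype=torch.float32, numel=6, is_tensor=True
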
@@ -230,7 +230,7 @@ def split_module_for_gpt2_test(
 use_partition.partitions_dependent_on.setdefault(def_partition_name)

 node_process_list = list(m.graph.nodes)
-# split nodes into parititons
+# split nodes into partitions
 while node_process_list:
 node = node_process_list.pop(0)
 orig_nodes[node.name] = node

@@ -277,7 +277,7 @@ def split_module_for_gpt2_test(
 if len(sorted_partitions) != len(partitions):
 raise RuntimeError("cycle exists between partitions!")

-# add placeholders to parititons
+# add placeholders to partitions
 for partition_name in sorted_partitions:
 partition = partitions[partition_name]
 for input in partition.inputs:

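The "cycle exists between partitions!" error fires when the partitions cannot be topologically ordered by their dependencies. A compact sketch of that check on a made-up dependency map (`submod_*` names are only placeholders):

    def topo_sort(deps):
        # deps maps a partition name to the set of partitions it depends on.
        sorted_parts, remaining = [], dict(deps)
        while remaining:
            ready = [p for p, d in remaining.items() if not (d & remaining.keys())]
            if not ready:
                raise RuntimeError("cycle exists between partitions!")
            for p in ready:
                sorted_parts.append(p)
                del remaining[p]
        return sorted_parts

    print(topo_sort({"submod_0": set(), "submod_1": {"submod_0"}, "submod_2": {"submod_1"}}))
    # ['submod_0', 'submod_1', 'submod_2']
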
@@ -29,8 +29,8 @@ class Partition:
 f" nodes: {self.node_names},\n" \
 f" inputs: {self.inputs},\n" \
 f" outputs: {self.outputs},\n" \
-f" partitions depenent on: {self.partitions_dependent_on},\n" \
-f" parition dependents: {self.partition_dependents}"
+f" partitions dependent on: {self.partitions_dependent_on},\n" \
+f" partition dependents: {self.partition_dependents}"


 # Creates subgraphs out of main graph