import copy
import operator
import warnings
from functools import reduce
from typing import Dict, List, Optional, Union

import torch
from torch.fx.node import Node
from torch.utils._pytree import tree_map

from colossalai.device.device_mesh import DeviceMesh
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
from colossalai.tensor.sharding_spec import ShardingSpec

from ..constants import INFINITY_COST

__all__ = ['generate_sharding_spec', 'generate_resharding_costs']


def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
    """
    Generate the sharding spec of the tensor based on the given dim_partition_dict.

    Args:
        input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used,
            the shape is read from its ``_meta_data`` attribute.
        device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
        dim_partition_dict (Dict[int, List[int]]): a dictionary to specify the sharding specs, the key is the tensor
            dimension and the value is the list of mesh dimensions used to shard that tensor dimension.

    Returns:
        ShardingSpec: built from ``device_mesh``, the resolved shape and ``dim_partition_dict``.

    Raises:
        TypeError: if ``input_`` is neither a ``Node`` nor a ``torch.Tensor``.
        AssertionError: if the node carries no usable ``_meta_data``, or if a tensor dimension is not divisible
            by the product of the mesh sizes it is sharded over.
    """

    if isinstance(input_, Node):
        assert hasattr(input_, '_meta_data'), 'The given node has no attribute _meta_data'
        meta_tensor = input_._meta_data
        assert meta_tensor is not None, "The given node's _meta_data attribute is None"
        shape = meta_tensor.shape
    elif isinstance(input_, torch.Tensor):
        shape = input_.shape
    else:
        raise TypeError(
            f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
        )

    # every sharded tensor dimension must split evenly over the mesh dimensions assigned to it
    for dim_index, sharding_index_list in dim_partition_dict.items():
        sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
        sharding_size = reduce(operator.mul, sharding_list, 1)
        assert shape[
            dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'

    sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
    return sharding_spec


def generate_resharding_costs(nodes: List[Node],
                              sharding_specs: List[ShardingSpec],
                              count_backward: Optional[bool] = True,
                              dtype: Optional[torch.dtype] = None,
                              index=None):
    '''
    Compute the resharding costs with this specific strategy.

    Argument:
        nodes (List[Node]): a list of nodes; each one is expected to expose ``strategies_vector``.
        sharding_specs (List[ShardingSpec]): the target ShardingSpec for each node, paired with ``nodes`` by position.
        count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True.
            False can be used for inference. NOTE: this flag is not referenced in this function body.
        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None, in which case the
            element size of an empty tensor created with ``dtype=None`` is used.
        index: position used to pick one ShardingSpec when a strategy's ``output_sharding_spec`` is a list.
            It must be a valid list index in that case; the default None only works for non-list outputs.

    Returns:
        dict: maps every node to a list holding one cost per strategy in its ``strategies_vector``.
    '''
    # The resharding_cost of weight is counted due to sharing weight cases.
    resharding_costs = {}
    size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()

    # shape consistency manager is a singleton class
    shape_consistency_manager = ShapeConsistencyManager()

    for input_node, input_spec in zip(nodes, sharding_specs):
        resharding_costs[input_node] = []
        for strategy in input_node.strategies_vector:
            input_sharding_spec = strategy.output_sharding_spec
            if not isinstance(input_sharding_spec, ShardingSpec):
                assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
                input_sharding_spec = input_sharding_spec[index]
            assert isinstance(input_sharding_spec, ShardingSpec), 'The input node should NOT be a tuple of tensor.'
            try:
                # compute the resharding cost
                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
                    input_sharding_spec, input_spec)

                # we need multiply the size of elem dtype to get correct communication cost
                resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
            except AssertionError as e:
                # a failed conversion is reported as a warning and priced at INFINITY_COST
                warnings.warn(f'{e}')
                resharding_cost = INFINITY_COST
            resharding_costs[input_node].append(resharding_cost)
    return resharding_costs


def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_length_threshold: int = 20):
    '''
    Find the largest repeat blocks in the node list, whose length is at least the threshold.

    As a side effect, every node in ``node_list`` gets a ``hash_key`` attribute.

    Args:
        node_list (List[torch.fx.Node]): the nodes to be analyzed, in order.
        root_module: the module used to resolve the target of ``call_module`` nodes via ``get_submodule``.
        common_length_threshold (int): the minimum length of a repeat block.

    Returns:
        list: one slice of ``node_list`` per occurrence of the repeated block, all of the same
        requested length; an empty list if no repeated block was found.
    '''

    def _process_arg(data):
        # tensors are reduced to their size and slices to a plain tuple
        if isinstance(data, torch.Tensor):
            data = data.size()
        elif isinstance(data, slice):
            data = (data.start, data.step, data.stop)
        return data

    def _process_args(args):
        new_args = []
        for arg in args:
            # prefer the meta data attached to the argument, fall back to the raw argument
            meta_data = arg._meta_data if hasattr(arg, '_meta_data') else arg
            new_args.append(tree_map(_process_arg, meta_data))
        return new_args

    def _all_equal(check_list, check_fn):
        # every element is compared against the last one
        base_value = check_list[-1]
        return all(check_fn(e, base_value) for e in check_list)

    def _check_node_equal(node1, node2):
        return hash(node1.hash_key) == hash(node2.hash_key)

    def _check_node_list_equal(l1, l2):
        if len(l1) != len(l2):
            return False
        return all(_check_node_equal(node1, node2) for node1, node2 in zip(l1, l2))

    # attach a structural key to every node: op, target (module type for call_module) and processed args
    for node in node_list:
        if node.op == 'call_module':
            target = type(root_module.get_submodule(node.target))
        else:
            target = node.target

        new_args = _process_args(node.args)

        if node.op != 'get_attr':
            hash_key = (node.op, target, *new_args)
        else:
            hash_key = (node.op,)

        node.hash_key = hash_key

    # group the positions of the nodes by the hash of their key
    hash_value_to_node_dict = {}
    for index, node in enumerate(node_list):
        hash_value_to_node_dict.setdefault(hash(node.hash_key), []).append(index)

    max_common_length = common_length_threshold
    common_blocks_index = []
    for node in node_list:
        start_index_list = hash_value_to_node_dict[hash(node.hash_key)]
        # the comparison will be triggered if a common node appears
        if len(start_index_list) < 2:
            continue

        check_block_list = [node_list[start:start + max_common_length] for start in start_index_list]
        if not _all_equal(check_block_list, _check_node_list_equal):
            continue

        common_blocks_index = copy.deepcopy(start_index_list)
        # Number of further positions that may be probed behind the last occurrence. The raw value is -1
        # when the last occurrence ends exactly at the end of node_list; it is clamped to 0 so that the
        # already verified block length is never shrunk.
        max_step = max(0, len(node_list) - common_blocks_index[-1] - max_common_length - 1)

        for i in range(max_step):
            next_node_list = [node_list[start + max_common_length + i] for start in start_index_list]
            if not _all_equal(next_node_list, _check_node_equal):
                max_step = i
                break
        max_common_length += max_step

    # recover common subgraph from the index
    common_blocks = []
    for start in common_blocks_index:
        common_blocks.append(node_list[start:start + max_common_length])

    return common_blocks