|
|
|
import copy
|
|
|
|
import operator
|
|
|
|
import warnings
|
|
|
|
from functools import reduce
|
|
|
|
from typing import Dict, List, Optional, Union
|
|
|
|
|
|
|
|
import torch
|
|
|
|
from torch.fx.node import Node
|
|
|
|
from torch.utils._pytree import tree_map
|
|
|
|
|
|
|
|
from colossalai.device.device_mesh import DeviceMesh
|
|
|
|
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
|
|
|
|
from colossalai.tensor.sharding_spec import ShardingSpec
|
|
|
|
|
|
|
|
from ..constants import INFINITY_COST
|
|
|
|
|
|
|
|
__all__ = ['generate_sharding_spec', 'generate_resharding_costs']
|
|
|
|
|
|
|
|
|
|
|
|
def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
    """
    Build a ShardingSpec for a tensor (or the meta tensor attached to an fx Node)
    according to dim_partition_dict.

    Args:
        input_ (Union[Node, torch.Tensor]): either a torch.fx.Node carrying a
            ``_meta_data`` tensor, or a plain tensor; only its shape is used.
        device_mesh (DeviceMesh): the device mesh describing the cluster; its
            ``mesh_shape`` gives the size of every mesh dimension.
        dim_partition_dict (Dict[int, List[int]]): maps a tensor dimension to the
            list of mesh dimensions it is sharded over.

    Returns:
        ShardingSpec: the sharding spec built from the tensor shape and partition dict.

    Raises:
        TypeError: if ``input_`` is neither a Node nor a torch.Tensor.
    """
    # Resolve the tensor shape from whichever kind of input we received.
    if isinstance(input_, torch.Tensor):
        shape = input_.shape
    elif isinstance(input_, Node):
        assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
        meta_tensor = input_._meta_data
        assert meta_tensor is not None, "The given node's _meta_data attribute is None"
        shape = meta_tensor.shape
    else:
        raise TypeError(
            f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
        )

    # Every sharded tensor dimension must divide evenly across the product of
    # the mesh-dimension sizes it is partitioned over.
    for dim_index, sharding_index_list in dim_partition_dict.items():
        sharding_size = reduce(operator.mul,
                               (device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list), 1)
        assert shape[
            dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'

    return ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
|
|
|
|
|
|
|
|
|
|
|
|
def generate_resharding_costs(nodes: List[Node],
                              sharding_specs: List[ShardingSpec],
                              count_backward: Optional[bool] = True,
                              dtype: Optional[torch.dtype] = None,
                              index=None):
    '''
    Compute, for every input node, the cost of resharding each of its strategies'
    output sharding specs into the spec required here.

    Argument:
        nodes (List[Node]): a list of input nodes; each must carry a
            ``strategies_vector`` attribute.
        sharding_specs (List[ShardingSpec]): the target ShardingSpec for each node,
            paired positionally with ``nodes``.
        count_backward (Optional[bool]): accepted for API compatibility;
            NOTE(review): currently unused in this function body — confirm intent.
        dtype (Optional[torch.dtype]): element dtype used to convert element counts
            into byte-based communication costs; ``None`` uses torch's default dtype.
        index: when a strategy's output sharding spec is a list, selects which entry
            belongs to this input.

    Returns:
        dict mapping each input node to a list of per-strategy resharding costs
        (INFINITY_COST when shape consistency cannot be resolved).
    '''
    # Weight nodes are included on purpose: shared weights can require resharding.
    resharding_costs = {}
    # element_size() of an empty tensor gives the per-element byte width of dtype.
    size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()

    # ShapeConsistencyManager is a singleton, so constructing it here is cheap.
    shape_consistency_manager = ShapeConsistencyManager()

    for input_node, input_spec in zip(nodes, sharding_specs):
        cost_list = []
        for strategy in input_node.strategies_vector:
            input_sharding_spec = strategy.output_sharding_spec
            if not isinstance(input_sharding_spec, ShardingSpec):
                # A multi-output strategy stores a list of specs; pick ours by index.
                assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
                input_sharding_spec = input_sharding_spec[index]
            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
            try:
                # Element-count cost of converting the strategy's spec into ours.
                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
                    input_sharding_spec, input_spec)

                # Scale by element byte-width to obtain a communication cost in bytes.
                resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
            except AssertionError as e:
                # Incompatible specs: warn and make this strategy effectively unusable.
                warnings.warn(f'{e}')
                resharding_cost = INFINITY_COST
            cost_list.append(resharding_cost)
        resharding_costs[input_node] = cost_list
    return resharding_costs
|
|
|
|
|
|
|
|
|
|
|
|
def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_length_threshold: int = 20):
    '''
    Find the largest repeated block of nodes in the graph whose length is at least
    the given threshold, and return every occurrence of that block.

    Each node is first assigned a structural ``hash_key`` (op, resolved target, and
    shape-normalized args); two nodes with equal hash keys are considered
    interchangeable. Starting from any hash value that occurs at least twice, the
    candidate block is greedily extended node-by-node for as long as all
    occurrences stay equal.

    Args:
        node_list (List[torch.fx.Node]): the graph's nodes in topological order.
        root_module: the owning module, used to resolve ``call_module`` targets to
            their submodule type so identical layers at different paths compare equal.
        common_length_threshold (int): the minimum length a repeated block must have
            before it is considered.

    Returns:
        A list of node sub-lists, one per occurrence of the repeated block found
        (empty if no block of at least the threshold length repeats).
    '''

    # graph = gm.graph

    def _process_args(args):
        # Normalize a node's args into hashable, shape-only descriptors so that
        # structurally identical calls hash equally regardless of tensor values.
        new_args = []
        for arg in args:
            # fx Nodes carry their example output in `_meta_data`; plain values pass through.
            if hasattr(arg, '_meta_data'):
                meta_data = arg._meta_data
            else:
                meta_data = arg

            def _process_arg(data):
                # Tensors are represented only by their size; slices by their fields.
                if isinstance(data, torch.Tensor):
                    data = data.size()
                elif isinstance(data, slice):
                    data = (data.start, data.step, data.stop)
                return data

            # Apply the normalization to every leaf of (possibly nested) arg structures.
            new_meta_data = tree_map(_process_arg, meta_data)
            new_args.append(new_meta_data)

        return new_args

    def _all_equal(check_list, check_fn):
        # True iff every element equals the last element under check_fn.
        base_value = check_list[-1]
        for e in check_list:
            if not check_fn(e, base_value):
                return False
        return True

    def _check_node_list_equal(l1, l2):
        # Two node slices are equal when they match pairwise by hash_key.
        if len(l1) != len(l2):
            return False
        for node1, node2 in zip(l1, l2):
            if hash(node1.hash_key) != hash(node2.hash_key):
                return False
        return True

    def _check_node_equal(node1, node2):
        # Single-node equality by hash_key.
        if hash(node1.hash_key) == hash(node2.hash_key):
            return True
        return False

    # Pass 1: attach a structural hash_key to every node.
    for index, node in enumerate(node_list):
        if node.op == 'call_module':
            # Resolve the module path to its class so same-type layers at
            # different paths (e.g. repeated transformer blocks) compare equal.
            target = node.target
            submod = root_module.get_submodule(target)
            submod_type = type(submod)
            target = submod_type
        else:
            target = node.target

        new_args = _process_args(node.args)

        if node.op != 'get_attr':
            hash_key = (node.op, target, *new_args)
        else:
            # NOTE: all get_attr nodes share the key ('get_attr',), i.e. any two
            # get_attr nodes are treated as interchangeable here.
            hash_key = (node.op,)

        setattr(node, 'hash_key', hash_key)

    # Pass 2: bucket node indices by hash value; buckets of size >= 2 mark
    # potential starting points of a repeated block.
    hash_value_to_node_dict = {}

    for index, node in enumerate(node_list):
        hash_value = hash(node.hash_key)
        if hash_value not in hash_value_to_node_dict:
            hash_value_to_node_dict[hash_value] = []
        hash_value_to_node_dict[hash_value].append(index)

    # node_list = list(graph.nodes)

    # NOTE(review): node_list_start is updated below but never read — looks like
    # leftover state; confirm before relying on it.
    node_list_start = 0
    # max_common_length starts at the threshold and only grows as longer repeated
    # blocks are confirmed, so later candidates must beat the current best.
    max_common_length = common_length_threshold
    common_blocks_index = []
    for index, node in enumerate(node_list):
        # the comparison will be triggered if a common node appears
        if len(hash_value_to_node_dict[hash(node.hash_key)]) >= 2:
            start_index_list = hash_value_to_node_dict[hash(node.hash_key)]
            # Tentative blocks of the current best length, one per occurrence.
            check_block_list = [node_list[start:start + max_common_length] for start in start_index_list]

            common_label = True
            if not _all_equal(check_block_list, _check_node_list_equal):
                common_label = False

            if common_label:
                # All occurrences match for max_common_length nodes; record them and
                # greedily extend the block while every occurrence keeps matching.
                common_blocks_index = copy.deepcopy(start_index_list)
                # Upper bound on extension so the last occurrence stays in range.
                max_step = len(node_list) - common_blocks_index[-1] - max_common_length - 1

                for i in range(max_step):
                    # add assertion to avoid out of index
                    next_node_list = [node_list[index + max_common_length + i] for index in start_index_list]
                    if not _all_equal(next_node_list, _check_node_equal):
                        # First mismatch: the block extends by exactly i nodes.
                        max_step = i
                        break
                max_common_length += max_step
                node_list_start += max_common_length

    # recover common subgraph from the index
    common_blocks = []
    for start in common_blocks_index:
        common_blocks.append(node_list[start:start + max_common_length])

    return common_blocks
|