# ColossalAI/colossalai/auto_parallel/tensor_shard/sharding_strategy.py

from copy import deepcopy
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Tuple, Union

import torch
from colossalai.tensor.shape_consistency import CommSpec
from colossalai.tensor.sharding_spec import ShardingSpec
from torch.fx.node import Node

from .constants import (BCAST_FUNC_OP, ELEMENTWISE_FUNC_OP, ELEMENTWISE_MODULE_OP, RESHAPE_FUNC_OP)

__all__ = ['OperationDataType', 'OperationData', 'TrainCycleItem', 'MemoryCost', 'ShardingStrategy', 'StrategiesVector']


class OperationDataType(Enum):
    """
    Tag describing which role a piece of operation data plays for its operator.

    An operation data can come from the argument list of an operator or the parameter list of a module,
    and it can also be the output of the operator.
    """
    # NOTE(review): how INPUT differs from ARG is not visible in this file -- confirm against the node handlers.
    INPUT = 0
    # data taken from the argument list of an operator
    ARG = 1
    # data taken from the parameter list of a module
    PARAM = 2
    # presumably a module buffer (non-parameter state) -- TODO confirm
    BUFFER = 3
    # data produced by the operator
    OUTPUT = 4


@dataclass
class OperationData:
    """
    OperationData is the data related to an operator, the data can be the operand or the output.

    Two OperationData objects are considered equal when their names are equal, and the hash is
    derived from the name only. ``type``, ``data`` and ``logical_shape`` take no part in equality
    or hashing, which is what allows instances to be used as dictionary keys looked up by name.

    Args:
        name (str): the name of the operation-related data
        type (OperationDataType): the type of the operation data
        data (Any): the value for this data, usually it is a meta tensor.
        logical_shape (Tuple[int]): the logical shape of the data, it can be different from its actual shape in memory.
            When omitted and ``data`` is a ``torch.Tensor``, it defaults to ``data.shape``.
    """
    name: str
    type: OperationDataType
    data: Any
    logical_shape: Tuple[int] = None

    def __post_init__(self):
        # if no logical shape is specified, use the data shape as the logical shape
        if self.logical_shape is None and isinstance(self.data, torch.Tensor):
            self.logical_shape = self.data.shape

    def __repr__(self) -> str:
        return f'OperationData(name={self.name}, type={self.type})'

    def __eq__(self, other) -> bool:
        # Comparing against an unrelated object (e.g. None) must not raise: returning
        # NotImplemented lets Python fall back to its default handling, so ``==`` yields False.
        if not isinstance(other, OperationData):
            return NotImplemented
        return other.name == self.name

    def __hash__(self) -> int:
        # hash on the name only so that it stays consistent with __eq__
        return hash(f'{self.name}')


@dataclass
class TrainCycleItem:
    """
    TrainCycleItem is a dataclass to store the items which have different values for the forward and backward pass
    in a training iteration.

    All three fields are required and are stored as given; this class performs no computation on them.

    Args:
        fwd (Any): the item for the forward pass
        bwd (Any): the item for the backward pass
        total (Any): the item for the whole training iteration. Presumably the combination of ``fwd`` and
            ``bwd``; it is not derived here and must be supplied by the caller.
    """
    fwd: Any
    bwd: Any
    total: Any


@dataclass
class MemoryCost:
    """
    MemoryCost is a dataclass which stores the memory usage in the program.

    Every field defaults to 0.

    Args:
        activation (int): the memory cost incurred by the activations in bytes.
        parameter (int): the memory cost incurred by the module parameter in bytes.
        buffer (int): the memory cost tracked separately from ``parameter``, presumably the one incurred
            by module buffers and presumably in bytes as well -- TODO confirm against the strategy generators.
    """
    activation: int = 0
    parameter: int = 0
    buffer: int = 0


@dataclass
class ShardingStrategy:
    """
    ShardingStrategy is a dataclass to store the meta information on tensor sharding for a node.

    Every field except ``name`` defaults to None. The accessors below treat a missing
    ``sharding_specs`` mapping as an empty one instead of failing.

    Args:
        name (str): express the sharding strategies in string, such as 'S0S1 = S0R x RS1'.
        sharding_specs (Dict[OperationData, Union[ShardingSpec, Tuple[ShardingSpec]]]): the sharding spec(s)
            chosen for each operation data of the node. (default to None)
        compute_cost (TrainCycleItem): Computation cost to complete this strategy. (default to None)
        communication_cost (TrainCycleItem): Communication cost to complete this strategy. (default to None)
        memory_cost (TrainCycleItem): Memory cost of the output node using this strategy. (default to None)
        communication_actions (Dict[OperationData, CommSpec]): the communication spec attached to an
            operation data. (default to None)
        resharding_costs (Dict[Node, List[TrainCycleItem]]): a list of cost items stored per fx node.
            (default to None)
    """
    name: str
    sharding_specs: Dict[OperationData, Union[ShardingSpec, Tuple[ShardingSpec]]] = None
    compute_cost: TrainCycleItem = None
    communication_cost: TrainCycleItem = None
    memory_cost: TrainCycleItem = None
    communication_actions: Dict[OperationData, CommSpec] = None
    resharding_costs: Dict[Node, List[TrainCycleItem]] = None

    @property
    def input_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:
        """Sharding specs of the ARG entries followed by the PARAM entries, as a new dict."""
        specs = {}
        specs.update(self._get_sharding_spec(OperationDataType.ARG))
        specs.update(self._get_sharding_spec(OperationDataType.PARAM))
        return specs

    @property
    def argument_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:
        """Sharding specs whose operation data is of type ARG."""
        return self._get_sharding_spec(OperationDataType.ARG)

    @property
    def param_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:
        """Sharding specs whose operation data is of type PARAM."""
        return self._get_sharding_spec(OperationDataType.PARAM)

    @property
    def output_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:
        """Sharding specs whose operation data is of type OUTPUT."""
        return self._get_sharding_spec(OperationDataType.OUTPUT)

    def _get_sharding_spec(self, operation_data_type: OperationDataType):
        """
        Return a new dict holding the entries of ``sharding_specs`` whose key has the given type.

        An empty dict is returned when ``sharding_specs`` has not been set.
        """
        # sharding_specs defaults to None, which has no ``items``; treat it as "no specs"
        if self.sharding_specs is None:
            return {}
        return {k: v for k, v in self.sharding_specs.items() if k.type == operation_data_type}

    def get_op_data_by_name(self, name: str):
        """
        Return the first OperationData key of ``sharding_specs`` whose name equals ``name``.

        Raises:
            KeyError: if no such key exists, including when ``sharding_specs`` is unset.
        """
        # ``or {}`` keeps the documented KeyError when sharding_specs is still None
        for op_data in self.sharding_specs or {}:
            if op_data.name == name:
                return op_data
        raise KeyError(f"Could not find the OperationData with name {name}")

    def get_sharding_spec_by_name(self, name: str):
        """
        Return the sharding spec stored for the first OperationData whose name equals ``name``.

        Raises:
            KeyError: if no such key exists, including when ``sharding_specs`` is unset.
        """
        for op_data, sharding_spec in (self.sharding_specs or {}).items():
            if op_data.name == name:
                return sharding_spec
        raise KeyError(f"Could not find the ShardingSpec for OperationData with name {name}")

    def clone(self):
        """
        Return a new ShardingStrategy whose dictionary values and cost items are deep copies.

        Dictionary keys (OperationData / Node objects) and ``name`` are shared with the original.
        A field that is None stays None, and an empty dict stays an empty dict.
        """

        def _deepcopy_dict_vals(data: Dict):
            # only the values are copied; the keys are reused as-is
            return {k: deepcopy(v) for k, v in data.items()}

        # compare against None rather than relying on truthiness, otherwise an empty dict
        # would silently turn into None in the clone
        sharding_specs = _deepcopy_dict_vals(self.sharding_specs) if self.sharding_specs is not None else None
        communication_actions = (_deepcopy_dict_vals(self.communication_actions)
                                 if self.communication_actions is not None else None)
        resharding_costs = _deepcopy_dict_vals(self.resharding_costs) if self.resharding_costs is not None else None
        compute_cost = deepcopy(self.compute_cost)
        communication_cost = deepcopy(self.communication_cost)
        memory_cost = deepcopy(self.memory_cost)

        return ShardingStrategy(name=self.name,
                                sharding_specs=sharding_specs,
                                compute_cost=compute_cost,
                                communication_cost=communication_cost,
                                memory_cost=memory_cost,
                                communication_actions=communication_actions,
                                resharding_costs=resharding_costs)


class StrategiesVector(list):
    '''
    A list holding all the candidate sharding strategies of one node in the fx graph.

    On top of the plain list behaviour, it keeps a reference to the node together with
    the node's predecessor and successor nodes.

    Argument:
        node (Node): node for which the list of sharding strategies are generated.
    '''

    def __init__(self, node: Node):
        super().__init__()
        self.node = node
        # collect the neighbours of the node in the graph
        # TODO: placeholder input nodes
        input_nodes = list(node._input_nodes.keys())
        # an output node only keeps its first input node as predecessor
        self.predecessor_nodes = input_nodes[:1] if node.op == 'output' else input_nodes
        self.successor_nodes = list(node.users.keys())

    def check_merge(self):
        '''
        Tell whether this node can be merged into its source node.

        Returns:
            bool: True for an element-wise module, an element-wise function, a bcast function
                with exactly one predecessor node, or a reshape function; False otherwise.
        '''
        op_kind = self.node.op

        if op_kind == 'call_module':
            module_name = self.node.target
            owner = self.node.graph.owning_module
            module_cls = type(owner.get_submodule(module_name))
            # an element-wise module can be merged, because its output sharding spec
            # is always same as the input sharding spec.
            return module_cls in ELEMENTWISE_MODULE_OP

        if op_kind == 'call_function':
            func = self.node.target
            # element-wise op: the output sharding spec is always same as the input sharding spec.
            if func in ELEMENTWISE_FUNC_OP:
                return True
            # bcast op whose rhs is a scalar: it falls back to the element-wise case.
            if func in BCAST_FUNC_OP and len(self.predecessor_nodes) == 1:
                return True
            # reshape op: its output sharding spec is always fully replicated.
            return func in RESHAPE_FUNC_OP

        # any other kind of node is never merged
        return False
[autoparallel] added sharding spec conversion for linear handler (#1687) 2 years ago			`from copy import deepcopy`
[autoparallel] standardize the code structure (#1469) 2 years ago			`from dataclasses import dataclass`
[autoparallel] added new node handler (#1612) 2 years ago			`from enum import Enum`
[autoparallel] refactored the autoparallel module for organization (#1706) * [autoparallel] refactored the autoparallel module for organization * polish code 2 years ago			`from typing import Any, Dict, List, Tuple, Union`
[autoparallel] added new linear module handler (#1616) 2 years ago
[autoparallel] refactored the autoparallel module for organization (#1706) * [autoparallel] refactored the autoparallel module for organization * polish code 2 years ago			`import torch`
			`from colossalai.tensor.shape_consistency import CommSpec`
[autoparallel] standardize the code structure (#1469) 2 years ago			`from colossalai.tensor.sharding_spec import ShardingSpec`
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`from torch.fx.node import Node`
[autoparallel] refactored the autoparallel module for organization (#1706) * [autoparallel] refactored the autoparallel module for organization * polish code 2 years ago
			`from .constants import (BCAST_FUNC_OP, ELEMENTWISE_FUNC_OP, ELEMENTWISE_MODULE_OP, RESHAPE_FUNC_OP)`
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago
[autoparallel] collated all deprecated files (#1700) * [autoparallel] collated all deprecated files * polish code 2 years ago			`__all__ = ['OperationDataType', 'OperationData', 'TrainCycleItem', 'MemoryCost', 'ShardingStrategy', 'StrategiesVector']`
[autoparallel] Add conv handler to generate strategies and costs info for conv (#1467) 2 years ago

[autoparallel] added new linear module handler (#1616) 2 years ago			`class OperationDataType(Enum):`
[autoparallel] added new node handler (#1612) 2 years ago			`"""`
[autoparallel] added new linear module handler (#1616) 2 years ago			`An operation can come from the argument list of an operator or the parameter list of a module.`
[autoparallel] added new node handler (#1612) 2 years ago			`"""`
[autoparallel] added new strategy constructor template (#1661) * [autoparallel] added new strategy constructor template * polish code 2 years ago			`INPUT = 0`
			`ARG = 1`
			`PARAM = 2`
[autoparallel] resnet block runtime apply (#1709) * [autoparallel] resnet block runtime apply * seperate buffer and parameter in MemoryCost * polish code * add comments and todos * fix test issue 2 years ago			`BUFFER = 3`
			`OUTPUT = 4`
[autoparallel] added new node handler (#1612) 2 years ago

			`@dataclass`
[autoparallel] added new linear module handler (#1616) 2 years ago			`class OperationData:`
			`"""`
			`OperationData is the data related to an operator, the data can be the operand or the output.`

			`Args:`
			`name (str): the name of the operation-related data`
			`type (OperationDataType): the type of the operation data`
[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`data (Any): the value for this data, usually it is a meta tensor.`
[autoparallel] added new linear module handler (#1616) 2 years ago			`logical_shape (Tuple[int]): the logical shape of the data, it can be different from the its actual shape in memory.`
			`"""`
[autoparallel] added new node handler (#1612) 2 years ago			`name: str`
[autoparallel] added new linear module handler (#1616) 2 years ago			`type: OperationDataType`
[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`data: Any`
[autoparallel] added new linear module handler (#1616) 2 years ago			`logical_shape: Tuple[int] = None`

			`def __post_init__(self):`
			`# if no logical shape is specified, use the data shape as the logical shape`
[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`if self.logical_shape is None and isinstance(self.data, torch.Tensor):`
[autoparallel] added new linear module handler (#1616) 2 years ago			`self.logical_shape = self.data.shape`
[autoparallel] added new node handler (#1612) 2 years ago
[autoparallel] implemented all matmul strategy generator (#1650) 2 years ago			`def __repr__(self) -> str:`
			`return f'OperationData(name={self.name}, type={self.type})'`

[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`def __eq__(self, other) -> bool:`
			`return other.name == self.name`

[autoparallel] implemented all matmul strategy generator (#1650) 2 years ago			`def __hash__(self) -> int:`
[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`return hash(f'{self.name}')`
[autoparallel] implemented all matmul strategy generator (#1650) 2 years ago
[autoparallel] added new node handler (#1612) 2 years ago
[autoparallel] refactored the data structure for sharding strategy (#1610) 2 years ago			`@dataclass`
			`class TrainCycleItem:`
			`"""`
			`TrainCycleItem is a dataclass to store the items which have different values for the forward and backward pass`
			`in a training iteration.`

			`Args:`
[autoparallel] added new node handler (#1612) 2 years ago			`fwd (float): the item for the forward pass`
			`bwd (float): the item for the backward pass`
[autoparallel] refactored the data structure for sharding strategy (#1610) 2 years ago			`"""`
			`fwd: Any`
			`bwd: Any`
			`total: Any`


[autoparallel] added new linear module handler (#1616) 2 years ago			`@dataclass`
[autoparallel] implemented linear projection strategy generator (#1639) 2 years ago			`class MemoryCost:`
[autoparallel] added new linear module handler (#1616) 2 years ago			`"""`
[autoparallel] refactored the autoparallel module for organization (#1706) * [autoparallel] refactored the autoparallel module for organization * polish code 2 years ago			`MemoryCost is a dataclass which stores the memory usage in the program.`

			`Args:`
			`activation (int): the memory cost incurred by the activations in bytes.`
			`parameter (int): the memory cost incurred by the module parameter in bytes.`
[autoparallel] added new linear module handler (#1616) 2 years ago			`"""`
[autoparallel] implemented linear projection strategy generator (#1639) 2 years ago			`activation: int = 0`
			`parameter: int = 0`
[autoparallel] resnet block runtime apply (#1709) * [autoparallel] resnet block runtime apply * seperate buffer and parameter in MemoryCost * polish code * add comments and todos * fix test issue 2 years ago			`buffer: int = 0`
[autoparallel] added new linear module handler (#1616) 2 years ago

[autoparallel] refactored the data structure for sharding strategy (#1610) 2 years ago			`@dataclass`
[autoparallel] collated all deprecated files (#1700) * [autoparallel] collated all deprecated files * polish code 2 years ago			`class ShardingStrategy:`
[autoparallel] refactored the data structure for sharding strategy (#1610) 2 years ago			`"""`
			`ShardingStrategy is a dataclass to store the meta information on tensor sharding for a node.`

			`Args:`
			`name (str): express the sharding strategies in string, such as 'S0S1 = S0R x RS1'.`
			`output_sharding_spec (ShardingSpec): ShardingSpec of the output node.`
			`compute_cost (TrainCycleItem): Computation cost to complete this strategy. (default to None)`
			`communication_cost (TrainCycleItem): Communication cost to complete this strategy. (default to None)`
			`memory_cost (TrainCycleItem): Memory cost of the output node using this strategy. (default to None)`
			`input_sharding_specs (List(ShardingSpec)): The ShardingSpecs of the input nodes.`
			`"""`
			`name: str`
[autoparallel] add following node generator (#1673) * [autoparallel] add following node generator * polish code * polish code * update name of arguments 2 years ago			`sharding_specs: Dict[OperationData, Union[ShardingSpec, Tuple[ShardingSpec]]] = None`
[autoparallel] refactored the data structure for sharding strategy (#1610) 2 years ago			`compute_cost: TrainCycleItem = None`
			`communication_cost: TrainCycleItem = None`
			`memory_cost: TrainCycleItem = None`
[autoparallel] implemented linear projection strategy generator (#1639) 2 years ago			`communication_actions: Dict[OperationData, CommSpec] = None`
[autoparallel] add output handler and placeholder handler (#1694) * [autoparallel] add output handler and placeholder handler * Delete test_solver_with_resnet.py * fix test bugs 2 years ago			`resharding_costs: Dict[Node, List[TrainCycleItem]] = None`
[autoparallel] added new linear module handler (#1616) 2 years ago
			`@property`
			`def input_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:`
			`specs = {}`
			`specs.update(self._get_sharding_spec(OperationDataType.ARG))`
			`specs.update(self._get_sharding_spec(OperationDataType.PARAM))`
			`return specs`

			`@property`
			`def argument_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:`
			`return self._get_sharding_spec(OperationDataType.ARG)`

			`@property`
			`def param_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:`
			`return self._get_sharding_spec(OperationDataType.PARAM)`

			`@property`
			`def output_sharding_specs(self) -> Dict[OperationData, ShardingSpec]:`
			`return self._get_sharding_spec(OperationDataType.OUTPUT)`

			`def _get_sharding_spec(self, operation_data_type: OperationDataType):`
			`specs = {k: v for k, v in self.sharding_specs.items() if k.type == operation_data_type}`
			`return specs`
[autoparallel] added new node handler (#1612) 2 years ago
[autoparallel] added compute resharding costs for node handler (#1662) 2 years ago			`def get_op_data_by_name(self, name: str):`
			`for op_data in self.sharding_specs.keys():`
			`if op_data.name == name:`
			`return op_data`
			`raise KeyError(f"Could not find the OperationData with name {name}")`

			`def get_sharding_spec_by_name(self, name: str):`
			`for op_data, sharding_spec in self.sharding_specs.items():`
			`if op_data.name == name:`
			`return sharding_spec`
			`raise KeyError(f"Could not find the ShardingSpec for OperationData with name {name}")`

[autoparallel] added sharding spec conversion for linear handler (#1687) 2 years ago			`def clone(self):`

			`def _deepcopy_dict_vals(data: Dict):`
			`return {k: deepcopy(v) for k, v in data.items()}`

			`sharding_specs = _deepcopy_dict_vals(self.sharding_specs) if self.sharding_specs else None`
			`communication_actions = _deepcopy_dict_vals(self.communication_actions) if self.communication_actions else None`
			`resharding_costs = _deepcopy_dict_vals(self.resharding_costs) if self.resharding_costs else None`
			`compute_cost = deepcopy(self.compute_cost)`
			`communication_cost = deepcopy(self.communication_cost)`
			`memory_cost = deepcopy(self.memory_cost)`

[autoparallel] collated all deprecated files (#1700) * [autoparallel] collated all deprecated files * polish code 2 years ago			`return ShardingStrategy(name=self.name,`
			`sharding_specs=sharding_specs,`
			`compute_cost=compute_cost,`
			`communication_cost=communication_cost,`
			`memory_cost=memory_cost,`
			`communication_actions=communication_actions,`
			`resharding_costs=resharding_costs)`
[autoparallel] added sharding spec conversion for linear handler (#1687) 2 years ago
[autoparallel] added new node handler (#1612) 2 years ago
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`class StrategiesVector(list):`
[autoparallel] Add conv handler to generate strategies and costs info for conv (#1467) 2 years ago			`'''`
			`Each node in fx graph will have a corresponding StrategiesVector, to store all the possible`
			`strategies of the node.`

			`Argument:`
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`node (Node): node for which the list of sharding strategies are generated.`
[autoparallel] Add conv handler to generate strategies and costs info for conv (#1467) 2 years ago			`'''`

[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`def __init__(self, node: Node):`
			`super().__init__()`
[autoparallel] Add conv handler to generate strategies and costs info for conv (#1467) 2 years ago			`self.node = node`
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`# fetch its input and output nodes`
[autoparellel]add strategies constructor (#1505) * [autoparellel]add strategies constructor * remove duplicated strategies * polish code * adapt cost graph with StrategiesConstructor * polish 2 years ago			`# TODO: placeholder input nodes`
[autoparallel] integrate auto parallel with torch fx (#1479) 2 years ago			`self.predecessor_nodes = list(node._input_nodes.keys())`
[autoparallel] remove no strategy nodes (#1652) * [autoparallel] remove no strategy nodes * fix none object iteration issue 2 years ago			`if self.node.op == 'output':`
			`self.predecessor_nodes = list(node._input_nodes.keys())[:1]`
[autoparallel] add cost graph class (#1481) * [autoparallel] add cost graph class * polish code 2 years ago			`self.successor_nodes = list(node.users.keys())`
[autoparallel] Add conv handler to generate strategies and costs info for conv (#1467) 2 years ago
			`def check_merge(self):`
[autoparellel]add strategies constructor (#1505) * [autoparellel]add strategies constructor * remove duplicated strategies * polish code * adapt cost graph with StrategiesConstructor * polish 2 years ago			`merge_label = False`
			`if self.node.op == 'call_module':`
			`target = self.node.target`
			`root_module = self.node.graph.owning_module`
			`submod = root_module.get_submodule(target)`
			`submod_type = type(submod)`
[autoparallel] add reshape handler (#1594) * [autoparallel] add reshape handler * polish code 2 years ago			`# merge elementwise module node into source nodes`
			`# we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.`
[autoparellel]add strategies constructor (#1505) * [autoparellel]add strategies constructor * remove duplicated strategies * polish code * adapt cost graph with StrategiesConstructor * polish 2 years ago			`if submod_type in ELEMENTWISE_MODULE_OP:`
			`merge_label = True`

			`if self.node.op == 'call_function':`
[autoparallel] add reshape handler (#1594) * [autoparallel] add reshape handler * polish code 2 years ago			`# we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.`
[autoparellel]add strategies constructor (#1505) * [autoparellel]add strategies constructor * remove duplicated strategies * polish code * adapt cost graph with StrategiesConstructor * polish 2 years ago			`if self.node.target in ELEMENTWISE_FUNC_OP:`
			`merge_label = True`
[autoparallel] add bcast op handler (#1600) * [autoparallel] add bcast op handler * polish code * add more BCAST FUNC OP * polish code * add exception handler * polish 2 years ago			`# we could merge bcast op if the rhs is a scalar, because it will fall back to the element-wise case.`
			`if self.node.target in BCAST_FUNC_OP and len(self.predecessor_nodes) == 1:`
			`merge_label = True`
[autoparallel] add reshape handler (#1594) * [autoparallel] add reshape handler * polish code 2 years ago			`# we could merge reshape op, because the output sharding spec of reshape op is always fully replicated.`
			`if self.node.target in RESHAPE_FUNC_OP:`
			`merge_label = True`
[autoparellel]add strategies constructor (#1505) * [autoparellel]add strategies constructor * remove duplicated strategies * polish code * adapt cost graph with StrategiesConstructor * polish 2 years ago
			`return merge_label`