[autoparallel] refactored the data structure for sharding strategy (#1610)

2022-09-20 11:20:54 +08:00 · 2022-09-20 11:20:54 +08:00 · edb67cb378
parent 933b6c6367
commit edb67cb378
1 changed file with 42 additions and 1 deletion
--- a/colossalai/auto_parallel/solver/sharding_strategy.py
+++ b/colossalai/auto_parallel/solver/sharding_strategy.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from colossalai.tensor.sharding_spec import ShardingSpec
-from typing import Dict, List, Union, Tuple
+from typing import Dict, List, Union, Tuple, Any
 from torch.fx.node import Node
 from .constants import *
@@ -37,6 +37,47 @@ class ShardingStrategy:
    input_shardings: List[ShardingSpec] = None
@dataclass
 class TrainCycleItem:
    """
    TrainCycleItem is a dataclass to store an item which has different values for the forward and
    backward pass in a training iteration.

    All three fields are required and are stored exactly as given; this class does not compute
    ``total`` from ``fwd`` and ``bwd``, so the caller must supply a consistent value.

    Args:
        fwd (Any): the item for the forward pass
        bwd (Any): the item for the backward pass
        total (Any): the total value for the forward and backward pass
    """
    fwd: Any
    bwd: Any
    total: Any
@dataclass
 class ShardingStrategy_V2:
    """
    ShardingStrategy_V2 is a dataclass to store the meta information on tensor sharding for a node.

    Only ``name`` and ``output_sharding_spec`` are required; every other field defaults to None.

    Args:
        name (str): express the sharding strategies in string, such as 'S0S1 = S0R x RS1'.
        output_sharding_spec (ShardingSpec): ShardingSpec of the output node.
        compute_cost (TrainCycleItem): Computation cost to complete this strategy. (default to None)
        communication_cost (TrainCycleItem): Communication cost to complete this strategy. (default to None)
        memory_cost (TrainCycleItem): Memory cost of the output node using this strategy. (default to None)
        input_sharding_specs (List[ShardingSpec]): The ShardingSpecs of the input nodes. (default to None)
        input_resharding_costs (Dict[Node, List[float]]): input_resharding_costs[arg][j] means the cost for the
                                                  argument ``arg`` in the output node argument list, with the j-th
                                                  strategy in its strategies_vector, to transform to the sharding
                                                  spec wanted in this strategy. (default to None)
                                                  NOTE(review): the field is annotated with Node keys, while this
                                                  description previously indexed arguments by integer position --
                                                  confirm which key type callers actually use.
    """
    name: str
    output_sharding_spec: ShardingSpec
    compute_cost: TrainCycleItem = None
    communication_cost: TrainCycleItem = None
    memory_cost: TrainCycleItem = None
    input_sharding_specs: List[ShardingSpec] = None
    input_resharding_costs: Dict[Node, List[float]] = None
 class StrategiesVector(list):
    '''
    Each node in fx graph will have a corresponding StrategiesVector, to store all the possible