[tensor] an initial idea of tensor spec (#865)

* an initial idea of tensor spec

* polish

* polish
Jiarui Fang 3 years ago committed by GitHub
parent 126ba573a8
commit 8af5f7423d

@@ -0,0 +1,47 @@
from enum import Enum
from typing import List

from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc

class ComputePattern(Enum):
    TP1DRow = 1
    TP1DCol = 2
    ZeRO = 3
    DP = 4


class ParallelAction(object):
    # Describes one parallelization step: a compute pattern applied on a
    # process group, ordered by priority. The class-level defaults were
    # removed: they were always overwritten in __init__, and calling
    # gpc.get_group() at import time fails before the context is initialized.

    def __init__(self, priority: int, compute_pattern: ComputePattern, process_group) -> None:
        self.priority = priority
        self.compute_pattern = compute_pattern
        self.process_group = process_group

class TensorSpec(object):
    """
    The specification of a ColoTensor. It contains two aspects of information:
    First, how the tensor is distributed in the heterogeneous memory space.
    Second, if the tensor is a model parameter, the spec contains the
    parallel computation pattern of the operator (layer).
    We have to consider the hybrid parallel mode.
    """
    # A list of parallel actions.
    # For example: on 8 GPUs, a hybrid parallel strategy is applied using
    # ZeRO with DP-degree = 4 and TP1DRow with TP-degree = 2.
    # parallel_action_list = [
    #     ParallelAction(10, ComputePattern.ZeRO, gpc.get_group(ParallelMode.DATA)),
    #     ParallelAction(1, ComputePattern.TP1DRow, gpc.get_group(ParallelMode.PARALLEL_1D))
    # ]
    # When the ColoTensor is initialized,
    # we first split the tensor according to the ZeRO ParallelAction,
    # then split it according to the TP1DRow ParallelAction.
    # During Linear computation:
    # before the Linear op, we gather the tensor according to ZeRO;
    # we perform the Linear op according to the TP1DRow compute pattern;
    # after the Linear op, we split the tensor according to ZeRO.
    parallel_action_list: List[ParallelAction] = []
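
# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the spec above: how a caller might build
# the 8-GPU hybrid TensorSpec described in the comments. The helper name
# `build_hybrid_spec` and the ordering convention (higher priority applied
# first) are assumptions for illustration only.
def build_hybrid_spec() -> TensorSpec:
    # Assumes the parallel context has already been initialized,
    # e.g. via colossalai.launch(...), so gpc.get_group(...) is valid.
    spec = TensorSpec()
    spec.parallel_action_list = [
        ParallelAction(10, ComputePattern.ZeRO, gpc.get_group(ParallelMode.DATA)),
        ParallelAction(1, ComputePattern.TP1DRow, gpc.get_group(ParallelMode.PARALLEL_1D)),
    ]
    # Assumed convention: apply actions from highest to lowest priority,
    # matching the ZeRO-then-TP1DRow order described above.
    spec.parallel_action_list.sort(key=lambda action: action.priority, reverse=True)
    return spec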