ColossalAI/colossalai/tensor/spec.py

from enum import Enum
from typing import List, Optional

import torch.distributed as dist

from colossalai.context.parallel_mode import ParallelMode
from colossalai.tensor.distspec import _DistSpec, DistPlacementPattern


class ComputePattern(Enum):
    """Kinds of compute pattern that a ``ParallelAction`` can carry."""
    # NOTE(review): member meanings below are inferred from their names — confirm.
    TP1D = 0    # presumably 1D tensor parallelism
    ZeRO = 1    # presumably ZeRO-style sharding
    DP = 2    # presumably data parallelism; this is the default pattern of ParallelAction


class ParallelAction:
    """
    Plain record describing one parallel action held by a ``TensorSpec``.

    The constructor only stores its arguments; no validation is performed.

    Args:
        priority: Key used by ``TensorSpec.sort`` to order actions
            (ascending). Defaults to 0.
        compute_pattern: The ``ComputePattern`` of this action.
            Defaults to ``ComputePattern.DP``.
        parallel_mode: The ``ParallelMode`` of this action.
            Defaults to ``ParallelMode.DATA``.
        gather_out: Stored as-is. Presumably tells the operator whether its
            output should be gathered — TODO confirm against the ops that
            read it. Defaults to True.
    """

    def __init__(self,
                 priority=0,
                 compute_pattern=ComputePattern.DP,
                 parallel_mode=ParallelMode.DATA,
                 gather_out=True) -> None:
        # What to compute and in which order relative to other actions.
        self.priority, self.compute_pattern = priority, compute_pattern
        # Where to compute and how the result is handed back.
        self.parallel_mode, self.gather_out = parallel_mode, gather_out


class TensorSpec(object):
    """
    Describes how a tensor is distributed and, optionally, how it is computed.

    It contains two aspects of information:
    First, how the tensor is distributed in heterogeneous memory space
    (the ``dist_spec``).
    Second, if the tensor is a model parameter, the spec contains the
    parallel computation pattern of the operator (layer), expressed as a list
    of ``ParallelAction`` objects.
    We have to consider the hybrid parallel mode, hence a list of actions.

    Args:
        dist_spec: Distribution spec of the tensor. This class reads its
            ``placement`` and ``process_group`` attributes and, for some
            queries, ``num_partitions`` and ``dims``.
        parallel_action_list: Parallel actions attached to the tensor.
            The sequence is copied, so the caller's object is never mutated.
            ``None`` (the default) means "no actions".
    """

    # A list of parallel actions.
    # For example: on 8 GPUs, a hybrid parallel strategy is applied
    # using ZeRO with DP-degree = 4 and 1DRowTP with TP-degree = 2.
    # parallel_action_list = [
    #     ParallelAction(10, ComputePattern.ZeRO, ParallelMode.DATA),
    #     ParallelAction(1, ComputePattern.TP1D, ParallelMode.PARALLEL_1D)
    # ]
    # When the ColoTensor is initialized,
    # we first split the tensor according to the ParallelAction of ZeRO,
    # then split the tensor according to the ParallelAction of TP1D.
    # During Linear computation:
    # Before the Linear Op, we gather the tensors according to ZeRO.
    # We perform the Linear Op according to the compute pattern of TP1D.
    # After the Linear Op, we split the tensors according to ZeRO.

    def __init__(self, dist_spec: _DistSpec, parallel_action_list: Optional[List[ParallelAction]] = None):
        # Always own a private list: a shared mutable default would leak
        # actions between instances, and sorting the caller's list in place
        # would be a hidden side effect on the argument.
        if parallel_action_list is None:
            self._parallel_action_list = []
        else:
            self._parallel_action_list = list(parallel_action_list)
        self.dist_spec = dist_spec
        self.sort()

    @property
    def parallel_action_list(self):
        """The list of parallel actions held by this spec (sorted by priority at construction)."""
        return self._parallel_action_list

    @property
    def num_action(self):
        """Number of parallel actions held by this spec."""
        return len(self._parallel_action_list)

    @property
    def compute_patterns(self):
        """The ``compute_pattern`` of every parallel action, in list order."""
        return [parallel_action.compute_pattern for parallel_action in self._parallel_action_list]

    def sort(self):
        """Sort the parallel actions in place by ascending ``priority``."""
        # Sorting an empty list is a no-op, so no emptiness guard is needed.
        self._parallel_action_list.sort(key=lambda parallel_action: parallel_action.priority)

    def get_action_by_compute_pattern(self, compute_pattern: ComputePattern):
        """
        Return the first action whose ``compute_pattern`` equals ``compute_pattern``.

        Returns:
            The matching ``ParallelAction``, or ``None`` if there is no match.
        """
        matches = (parallel_action for parallel_action in self._parallel_action_list
                   if parallel_action.compute_pattern == compute_pattern)
        return next(matches, None)

    def get_process_group(self):
        """Return ``dist_spec.process_group``."""
        return self.dist_spec.process_group

    def get_process_group_size(self):
        """Return ``torch.distributed.get_world_size`` of ``dist_spec.process_group``."""
        return dist.get_world_size(self.dist_spec.process_group)

    def get_placement(self):
        """Return ``dist_spec.placement``."""
        return self.dist_spec.placement

    def is_gathered(self):
        """
        Tell whether the tensor can be treated as not split across processes.

        True when any of the following holds:
          * the placement is ``DistPlacementPattern.REPLICATE``;
          * ``num_partitions`` has exactly one entry and that entry is 1;
          * the process group has exactly one member.

        The checks are evaluated left to right and short-circuit, so
        ``num_partitions`` and ``process_group`` are only read when the
        placement is not ``REPLICATE``.
        """
        return (self.dist_spec.placement == DistPlacementPattern.REPLICATE
                or (len(self.dist_spec.num_partitions) == 1 and self.dist_spec.num_partitions[0] == 1)
                or self.dist_spec.process_group.size() == 1)

    def is_1D_col(self):
        """True when the placement is ``SHARD`` over exactly one dim and that dim is -1."""
        return (self.dist_spec.placement == DistPlacementPattern.SHARD
                and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == -1)

    def is_1D_row(self):
        """True when the placement is ``SHARD`` over exactly one dim and that dim is 0."""
        return (self.dist_spec.placement == DistPlacementPattern.SHARD
                and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == 0)

    def has_compute_pattern(self, compute_pattern: ComputePattern):
        """Tell whether any parallel action has the given ``compute_pattern``."""
        return self.get_action_by_compute_pattern(compute_pattern) is not None
[tensor] refactor colo-tensor (#992) * refactor colo-tensor and update linear op * polish code * polish code * update ops and unit tests * update unit tests * polish code * rename dist_spec module * polish code * polish code * remove unneeded import * fix pipelinable 3 years ago			`import torch.distributed as dist`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`from enum import Enum`
[tensor] design DistSpec and DistSpecManager for ColoTensor (#934) * add dist spec * update linear op * polish code * polish code * update embedding op * polish unit tests * polish unit tests * polish comments * polish code * add test_dist_spec_mgr * polish code * refactor folder structure * polish unit tests * add get_process_group() for TensorSpec * polish code 3 years ago			`from typing import List`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`from colossalai.context.parallel_mode import ParallelMode`
[tensor] refactor colo-tensor (#992) * refactor colo-tensor and update linear op * polish code * polish code * update ops and unit tests * update unit tests * polish code * rename dist_spec module * polish code * polish code * remove unneeded import * fix pipelinable 3 years ago			`from colossalai.tensor.distspec import _DistSpec, DistPlacementPattern`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago
[tensor] customized op returns ColoTensor (#875) * [tensor] customized op returns ColoTensor * polish * polish code 3 years ago
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`class ComputePattern(Enum):`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`TP1D = 0`
			`ZeRO = 1`
			`DP = 2`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago
[Tensor] activation is an attr of ColoTensor (#897) 3 years ago
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`class ParallelAction(object):`
[tensor] customized op returns ColoTensor (#875) * [tensor] customized op returns ColoTensor * polish * polish code 3 years ago
[Tensor] activation is an attr of ColoTensor (#897) 3 years ago			`def __init__(self,`
			`priority=0,`
			`compute_pattern=ComputePattern.DP,`
			`parallel_mode=ParallelMode.DATA,`
			`gather_out=True) -> None:`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`self.priority = priority`
			`self.compute_pattern = compute_pattern`
[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`self.parallel_mode = parallel_mode`
[tensor] add ColoTensor 1Dcol (#888) 3 years ago			`self.gather_out = gather_out`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago
[tensor] refine linear and add gather for laynorm (#893) * refine linear and add function to ColoTensor * add gather for layernorm * polish * polish 3 years ago
[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`class TensorSpec(object):`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`"""`
			`It contains two aspects of information:`
			`First, How are tensors distributed in Heterougenous memory space.`
			`Second, if the tensor is a model parameter, the Spec contains the`
			`parallel computation pattern of the Operator (Layer).`
			`We have to consider the hybrid parallel mode.`
			`"""`
[tensor] customized op returns ColoTensor (#875) * [tensor] customized op returns ColoTensor * polish * polish code 3 years ago
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`# a list of parallel actions.`
			`# For example: On 8 GPUs, a hybrid parallel strategy is applied using`
			`# using ZeRO with DP-degree = 4 and 1DRowTP with TP-degree = 2.`
			`# parallel_action_list = [`
			`# ParallelAction(10, ComputePattern.ZeRO, gpc.get_group(ParallelMode.DATA)),`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`# ParallelAction(1, ComputePattern.TP1D_Linear, gpc.get_group(ParallelMode.PARALLEL_1D))`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`# ]`
			`# When the ColoTensor is initialized,`
			`# we first splitting tensor according to ParallelAction of ZeRO,`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`# then splitting tensor according to ParallelAction of TP1D_Linear.`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`# During Linear computation`
			`# Before Linear Op, we gather the tensors according to ZeRO.`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`# We perform Linear Op according to compute pattern of TP1D_Linear.`
[tensor] an initial dea of tensor spec (#865) * a initial dea of tensor spec * polish * polish 3 years ago			`# After Linear Op, we split the tensors according to ZeRO.`
[tensor] customized op returns ColoTensor (#875) * [tensor] customized op returns ColoTensor * polish * polish code 3 years ago
[tensor] design DistSpec and DistSpecManager for ColoTensor (#934) * add dist spec * update linear op * polish code * polish code * update embedding op * polish unit tests * polish unit tests * polish comments * polish code * add test_dist_spec_mgr * polish code * refactor folder structure * polish unit tests * add get_process_group() for TensorSpec * polish code 3 years ago			`def __init__(self, dist_spec: _DistSpec, parallel_action_list: List[ParallelAction] = []):`
[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`self._parallel_action_list = parallel_action_list`
[tensor] design DistSpec and DistSpecManager for ColoTensor (#934) * add dist spec * update linear op * polish code * polish code * update embedding op * polish unit tests * polish unit tests * polish comments * polish code * add test_dist_spec_mgr * polish code * refactor folder structure * polish unit tests * add get_process_group() for TensorSpec * polish code 3 years ago			`self.dist_spec = dist_spec`
[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`self.sort()`

			`@property`
			`def parallel_action_list(self):`
			`return self._parallel_action_list`

			`@property`
			`def num_action(self):`
			`return len(self._parallel_action_list)`

			`@property`
			`def compute_patterns(self):`
			`return [parallel_action.compute_pattern for parallel_action in self._parallel_action_list]`
[Tensor] activation is an attr of ColoTensor (#897) 3 years ago
[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`def sort(self):`
			`if len(self._parallel_action_list) > 0:`
[tensor] customized op returns ColoTensor (#875) * [tensor] customized op returns ColoTensor * polish * polish code 3 years ago			`self._parallel_action_list.sort(key=lambda parallel_action: parallel_action.priority)`

[Tensor] Add function to spec and update linear 1Drow and unit tests (#869) 3 years ago			`def get_action_by_compute_pattern(self, compute_pattern: ComputePattern):`
			`for parallel_action in self._parallel_action_list:`
			`if parallel_action.compute_pattern == compute_pattern:`
			`return parallel_action`
			`return None`
[tensor] design DistSpec and DistSpecManager for ColoTensor (#934) * add dist spec * update linear op * polish code * polish code * update embedding op * polish unit tests * polish unit tests * polish comments * polish code * add test_dist_spec_mgr * polish code * refactor folder structure * polish unit tests * add get_process_group() for TensorSpec * polish code 3 years ago
			`def get_process_group(self):`
			`return self.dist_spec.process_group`
add DistSpec for loss and test_model (#947) 3 years ago
[tensor] refactor colo-tensor (#992) * refactor colo-tensor and update linear op * polish code * polish code * update ops and unit tests * update unit tests * polish code * rename dist_spec module * polish code * polish code * remove unneeded import * fix pipelinable 3 years ago			`def get_process_group_size(self):`
			`return dist.get_world_size(self.dist_spec.process_group)`

add DistSpec for loss and test_model (#947) 3 years ago			`def get_placement(self):`
			`return self.dist_spec.placement`

			`def is_gathered(self):`
			`return self.dist_spec.placement == DistPlacementPattern.REPLICATE \`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`or (len(self.dist_spec.num_partitions) == 1`
add DistSpec for loss and test_model (#947) 3 years ago			`and self.dist_spec.num_partitions[0] == 1) \`
			`or (self.dist_spec.process_group.size() == 1)`

[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`def is_1D_col(self):`
add DistSpec for loss and test_model (#947) 3 years ago			`return self.dist_spec.placement == DistPlacementPattern.SHARD \`
[tensor] derive compute pattern from dist spec (#971) * derive compute pattern from dist spec * polish code 3 years ago			`and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == -1`

			`def is_1D_row(self):`
			`return self.dist_spec.placement == DistPlacementPattern.SHARD \`
			`and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == 0`

			`def has_compute_pattern(self, compute_pattern: ComputePattern):`
			`return self.get_action_by_compute_pattern(compute_pattern) is not None`