ColossalAI/colossalai/tensor/d_tensor/sharding_spec.py

from copy import deepcopy
from typing import Dict, List

from ..utils import merge_same_dim_mesh_list
from .misc import ShardingOutOfIndexError

# Public API of this module. Every name listed here must actually be bound in
# this module, otherwise `from <module> import *` raises AttributeError.
# 'ShardingException' was never defined or imported here; the exception this
# module imports from .misc and raises is ShardingOutOfIndexError.
__all__ = ['DimSpec', 'ShardingOutOfIndexError', 'ShardingSpec']

# Relative cost weights used by DimSpec to score the conversion between two dim specs.
ALLGATHER_COST = 20    # one all-gather step (drops the last sharded mesh axis)
SHARD_COST = 5    # one shard step (appends a mesh axis)
STEP_PENALTY = 6    # added between consecutive steps of a multi-step conversion
NAN = 'nan'    # placeholder for a spec pair that no conversion rule covers


class DimSpec:
    '''
    Sharding spec for a single dimension of a sharded tensor. It records the axes of the logical
    device mesh that the dimension is sharded along, and gives a method to compute the difference
    between two such specs.
    This class is used internally in ShardingSpec.

    Argument:
        shard_list(List[int]): the logical device mesh axes this dimension is sharded along.
            If shard_list is None or empty, the dim spec will be 'R' type (replicated).
    '''

    def __init__(self, shard_list):
        # The documented contract accepts None for a replicated dimension; normalise it
        # to an empty list so that len() and iteration below keep working.
        if shard_list is None:
            shard_list = []
        self.is_replica = len(shard_list) == 0
        self.shard_list = shard_list
        self.build_difference_2d_dict()

    def __eq__(self, other):
        # Equality is defined on the string form, so a DimSpec also compares equal to
        # its plain string representation (e.g. 'S0').
        return str(self) == str(other)

    def __repr__(self):
        if self.is_replica:
            return 'R'
        # 'S' followed by the concatenated mesh axes, e.g. [0, 1] -> 'S01'.
        return 'S' + ''.join(str(dim) for dim in self.shard_list)

    def _convert_str_to_shard_list(self, str_spec):
        '''
        Convert str_spec into shard_list.

        Argument:
            str_spec(str): dim spec in str type.

        Return:
            shard_list(List[int]): a new list for 'R', 'S0', 'S1' and 'S01'; None for any other string.
        '''
        # The literal is rebuilt on every call, so each caller gets its own list objects.
        known_specs = {'R': [], 'S0': [0], 'S1': [1], 'S01': [0, 1]}
        return known_specs.get(str_spec)

    @staticmethod
    def _shard_list_difference(source_shard_list, target_shard_list):
        '''
        Score the conversion from source_shard_list to target_shard_list.

        The rules are tried in order, so the single-step rules (plain all-gather, plain shard)
        take precedence over the multi-step fallbacks that match on length only.
        '''
        # Positive when the target is sharded along more mesh axes than the source.
        extra_axes = len(target_shard_list) - len(source_shard_list)

        # source same as target
        if source_shard_list == target_shard_list:
            return 0

        # all_gather(source) -> target
        if extra_axes == -1 and source_shard_list[:-1] == target_shard_list:
            return ALLGATHER_COST

        # shard(source) -> target
        if (extra_axes == 1 and source_shard_list == target_shard_list[:-1]
                and target_shard_list[-1] not in source_shard_list):
            return SHARD_COST

        # S1 -> S0 or S0 -> S1: source -> R -> target
        if extra_axes == 0:
            return ALLGATHER_COST + STEP_PENALTY + SHARD_COST

        # R -> S01
        if extra_axes == 2:
            return SHARD_COST + STEP_PENALTY + SHARD_COST

        # S01 -> R
        if extra_axes == -2:
            return ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST

        # S1 -> S01
        if extra_axes == 1:
            return ALLGATHER_COST + STEP_PENALTY + SHARD_COST + STEP_PENALTY + SHARD_COST

        # S01 -> S1
        if extra_axes == -1:
            return ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST + STEP_PENALTY + SHARD_COST

        return NAN

    def build_difference_2d_dict(self):
        '''
        Build a difference mapping for 2D device mesh case. It will be used to
        compute the difference between DimSpec pairs.

        The mapping is stored in self.difference_dict and is keyed by
        (source_spec, target_spec) string pairs over 'R', 'S0', 'S1' and 'S01'.
        '''
        spec_names = ('R', 'S0', 'S1', 'S01')
        difference_dict = {}
        for source_spec in spec_names:
            source_shard_list = self._convert_str_to_shard_list(source_spec)
            for target_spec in spec_names:
                target_shard_list = self._convert_str_to_shard_list(target_spec)
                difference_dict[(source_spec, target_spec)] = self._shard_list_difference(
                    source_shard_list, target_shard_list)

        self.difference_dict = difference_dict

    def dim_diff(self, other):
        '''
        The difference between two DimSpec.

        Argument:
            other(DimSpec): the dim spec to compare with.

        Return:
            difference(int): the difference between two DimSpec.

        Raises:
            KeyError: if either spec is not one of 'R', 'S0', 'S1' or 'S01', because the
                lookup table only covers the 2D device mesh case.

        Example:
            ```python
            dim_spec = DimSpec([0])
            other_dim_spec = DimSpec([0, 1])
            print(dim_spec.dim_diff(other_dim_spec))
            # output: 5
            ```
        '''
        return self.difference_dict[(str(self), str(other))]


class ShardingSpec:
    '''
    Sharding spec describes how to shard a tensor with dim_size dimensions. The sharding sequence looks like
    [R, R, S0, S1], which means the first two tensor dimensions are replicated, the third one is sharded
    along logical axis 0 and the fourth one is sharded along logical axis 1.

    Argument:
        dim_size (int): The number of dimensions of the tensor to be sharded.
        dim_partition_dict (Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
            and the value of the key describe which logical axis will be sharded in that dimension. Defaults to None.
            E.g. {0: [0, 1]} means the first dimension of the tensor will be sharded in logical axis 0 and 1.
        sharding_sequence (List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
            Generally, users should specify either dim_partition_dict or sharding_sequence.
            If both are given, users must ensure that they are consistent with each other. Defaults to None.

    Raises:
        ShardingOutOfIndexError: if sharding_sequence has more than dim_size elements, or if a key of
            dim_partition_dict is greater than or equal to dim_size.
    '''

    def __init__(self,
                 dim_size: int,
                 dim_partition_dict: Dict[int, List[int]] = None,
                 sharding_sequence: List[DimSpec] = None):
        self.dims = dim_size
        self.dim_partition_dict = dim_partition_dict
        self.sharding_sequence = sharding_sequence
        if self.sharding_sequence is None:
            # Only the dict view was given: derive the sequence view from it.
            assert self.dim_partition_dict is not None, 'dim_partition_dict should not be None, if sharding_sequence is NoneType object.'
            self.dim_partition_dict = merge_same_dim_mesh_list(dim_size=self.dims,
                                                               dim_partition_dict=self.dim_partition_dict)
            self.sharding_sequence = self.convert_dict_to_shard_sequence()

        elif self.dim_partition_dict is None:
            # Only the sequence view was given (it is known to be non-None in this branch):
            # derive the dict view from it.
            self.dim_partition_dict = self.convert_shard_sequence_to_dict()

        # When both views are given they are used as-is; only their bounds are checked.
        self._sanity_check()

    def _check_dim_partition_keys(self):
        '''
        Raise ShardingOutOfIndexError if any key of dim_partition_dict is not less than self.dims.
        '''
        if self.dim_partition_dict and max(self.dim_partition_dict) >= self.dims:
            raise ShardingOutOfIndexError(
                f'the key of dim_partition_dict should be less than {self.dims}, but got {max(self.dim_partition_dict)}.'
            )

    def _sanity_check(self):
        '''
        Check that neither view refers to a tensor dimension beyond self.dims.
        '''
        if len(self.sharding_sequence) > self.dims:
            raise ShardingOutOfIndexError(
                f'sharding_sequence should have {self.dims} elements, but got index {len(self.sharding_sequence)}.')

        self._check_dim_partition_keys()

    def __repr__(self):
        shard_sequence = ",".join(str(dimspec) for dimspec in self.sharding_sequence)
        return ' '.join(["ShardingSpec:", "\n\tshard_sequence: " + shard_sequence])

    def convert_dict_to_shard_sequence(self):
        '''
        Convert dim_partition_dict into list of DimSpec and return it.

        Raises:
            ShardingOutOfIndexError: if a key of dim_partition_dict is greater than or equal to self.dims.
        '''
        # Validate before indexing so that an out-of-range key is reported with the
        # sharding-specific error instead of a bare IndexError from the list assignment.
        self._check_dim_partition_keys()

        # Build one DimSpec per dimension; `[DimSpec([])] * n` would put the very same
        # object in every slot.
        sharding_sequence = [DimSpec([]) for _ in range(self.dims)]
        for dim, shard_list in self.dim_partition_dict.items():
            sharding_sequence[dim] = DimSpec(shard_list)
        return sharding_sequence

    def convert_shard_sequence_to_dict(self):
        '''
        Convert sharding_sequence into dim_partition_dict.

        Replicated dimensions are omitted; every sharded dimension maps to a copy of its shard_list.
        '''
        return {
            index: list(dim_spec.shard_list)
            for index, dim_spec in enumerate(self.sharding_sequence)
            if not dim_spec.is_replica
        }

    def spec_diff(self, other):
        '''
        This function is a naive version of difference computation. It just simply accumulates difference every dimension between the
        pair of sharding sequence.

        Example:
            ```python
            dim_partition_dict = {0: [0, 1]}
            # ShardingSpec:
            #     shard_sequence: S01,R,R
            sharding_spec = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict)
            dim_partition_dict_to_compare = {0: [0], 1: [1]}
            # ShardingSpec:
            #     shard_sequence: S0,S1,R
            sharding_spec_to_compare = ShardingSpec(dim_size=3, dim_partition_dict=dim_partition_dict_to_compare)
            print(sharding_spec.spec_diff(sharding_spec_to_compare))
            # output: 25
            ```
        Argument:
            other(ShardingSpec): The ShardingSpec to compare with.

        Return:
            difference(int): Difference between two ShardingSpec.
        '''
        assert len(self.sharding_sequence) == len(
            other.sharding_sequence), 'Cannot compare difference for two sharding specs with different length.'
        return sum(
            orig_dim_spec.dim_diff(other_dim_spec)
            for orig_dim_spec, other_dim_spec in zip(self.sharding_sequence, other.sharding_sequence))
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`from copy import deepcopy`
			`from typing import Dict, List`

			`from ..utils import merge_same_dim_mesh_list`
			`from .misc import ShardingOutOfIndexError`

			`__all__ = ['DimSpec', 'ShardingException', 'ShardingSpec']`

			`ALLGATHER_COST = 20`
			`SHARD_COST = 5`
			`STEP_PENALTY = 6`
			`NAN = 'nan'`


			`class DimSpec:`
			`'''`
[doc] Fix typo under colossalai and doc(#3618) * Fixed several spelling errors under colossalai * Fix the spelling error in colossalai and docs directory * Cautious Changed the spelling error under the example folder * Update runtime_preparation_pass.py revert autograft to autograd * Update search_chunk.py utile to until * Update check_installation.py change misteach to mismatch in line 91 * Update 1D_tensor_parallel.md revert to perceptron * Update 2D_tensor_parallel.md revert to perceptron in line 73 * Update 2p5D_tensor_parallel.md revert to perceptron in line 71 * Update 3D_tensor_parallel.md revert to perceptron in line 80 * Update README.md revert to resnet in line 42 * Update reorder_graph.py revert to indice in line 7 * Update p2p.py revert to megatron in line 94 * Update initialize.py revert to torchrun in line 198 * Update routers.py change to detailed in line 63 * Update routers.py change to detailed in line 146 * Update README.md revert random number in line 402 2023-04-26 03:38:43 +00:00			`Sharding spec for single dimension of the sharded tensor describe the sharding dimension of`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`logical device mesh and give a method to compute the difference between them.`
			`This class is used internally in ShardingSpec.`

			`Argument:`
			`shard_list(List[int]): if shard_list is None, the dim spec will be 'R' type.`
			`Otherwise, the element in shard_list means the data will be sharded in that dimension.`
			`'''`

			`def __init__(self, shard_list):`
			`self.is_replica = len(shard_list) == 0`
			`self.shard_list = shard_list`
			`self.build_difference_2d_dict()`

			`def __eq__(self, other):`
			`return str(self) == str(other)`

			`def __repr__(self):`
			`if self.is_replica:`
			`return 'R'`
			`target = 'S'`
			`for dim in self.shard_list:`
			`target += str(dim)`
			`return target`

			`def _convert_str_to_shard_list(self, str_spec):`
			`'''`
[nfc]fix typo colossalai/pipeline tensor nn (#3899) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel * fix typo colossalai/nn * revert change warmuped * fix typo colossalai/pipeline tensor nn 2023-06-06 06:07:36 +00:00			`Convert str_spec into shard_list.`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00
			`Argument:`
			`str_spec(str): dim spec in str type.`
			`'''`

			`if str_spec == 'R':`
			`return []`
			`if str_spec == 'S0':`
			`return [0]`
			`if str_spec == 'S1':`
			`return [1]`
			`if str_spec == 'S01':`
			`return [0, 1]`

			`def build_difference_2d_dict(self):`
			`'''`
[nfc]fix typo colossalai/pipeline tensor nn (#3899) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel * fix typo colossalai/nn * revert change warmuped * fix typo colossalai/pipeline tensor nn 2023-06-06 06:07:36 +00:00			`Build a difference mapping for 2D device mesh case. It will be used to`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`compute the difference between DimSpec pairs.`
			`'''`

			`source_spec_list = ['R', 'S0', 'S1', 'S01']`
			`target_spec_list = ['R', 'S0', 'S1', 'S01']`
			`difference_dict = {}`
			`for source_spec in source_spec_list:`
			`for target_spec in target_spec_list:`
			`legal_sharding_dims = []`
			`spec_pair = (deepcopy(source_spec), deepcopy(target_spec))`
			`source_shard_list = self._convert_str_to_shard_list(source_spec)`
			`target_shard_list = self._convert_str_to_shard_list(target_spec)`

			`# source same as target`
			`if source_shard_list == target_shard_list:`
			`difference = 0`

			`# all_gather(source) -> target`
			`elif len(source_shard_list`
			`) == len(target_shard_list) + 1 and source_shard_list[:-1] == target_shard_list:`
			`difference = ALLGATHER_COST`

			`# shard(source) -> target`
			`elif len(source_shard_list) == len(`
			`target_shard_list) - 1 and source_shard_list == target_shard_list[:-1] and target_shard_list[`
			`-1] not in source_shard_list:`
			`difference = SHARD_COST`

			`# S1 -> S0 or S0 -> S1`
			`elif len(source_shard_list) == len(target_shard_list):`
			`# source -> R -> target`
			`difference = ALLGATHER_COST + STEP_PENALTY + SHARD_COST`

			`# R -> S01`
			`elif len(source_shard_list) == len(target_shard_list) - 2:`
			`difference = SHARD_COST + STEP_PENALTY + SHARD_COST`

			`# S01 -> R`
			`elif len(source_shard_list) == len(target_shard_list) + 2:`
			`difference = ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST`

			`# S1 -> S01`
			`elif len(source_shard_list) == len(target_shard_list) - 1:`
			`difference = ALLGATHER_COST + STEP_PENALTY + SHARD_COST + STEP_PENALTY + SHARD_COST`

			`# S01 -> S1`
			`elif len(source_shard_list) == len(target_shard_list) + 1:`
			`difference = ALLGATHER_COST + STEP_PENALTY + ALLGATHER_COST + STEP_PENALTY + SHARD_COST`

			`else:`
			`difference = NAN`
			`difference_dict[spec_pair] = difference`

			`self.difference_dict = difference_dict`

			`def dim_diff(self, other):`
			`'''`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			`The difference between two DimSpec.`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00
			`Argument:`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			`other(DimSpec): the dim spec to compare with.`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00
			`Return:`
			`difference(int): the difference between two _DimSpec.`

			`Example:`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			```python
			`dim_spec = DimSpec([0])`
			`other_dim_spec = DimSpec([0, 1])`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`print(dim_spec.difference(other_dim_spec))`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			`# output: 5`
			```
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`'''`
			`difference = self.difference_dict[(str(self), str(other))]`
			`return difference`


			`class ShardingSpec:`
			`'''`
			`Sharding spec describes how to shard a tensor with dim_size dimensions. The sharding sequence looks like`
			`[R, R, S0, S1], which means`

			`Argument:`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			`dim_size (int): The number of dimensions of the tensor to be sharded.`
			`dim_partition_dict (Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,`
			`and the value of the key describe which logical axis will be sharded in that dimension. Defaults to None.`
			`E.g. {0: [0, 1]} means the first dimension of the tensor will be sharded in logical axis 0 and 1.`
			`sharding_sequence (List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].`
			`Generally, users should specify either dim_partition_dict or sharding_sequence.`
			`If both are given, users must ensure that they are consistent with each other. Defaults to None.`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`'''`

			`def __init__(self,`
			`dim_size: int,`
			`dim_partition_dict: Dict[int, List[int]] = None,`
			`sharding_sequence: List[DimSpec] = None):`
			`self.dims = dim_size`
			`self.dim_partition_dict = dim_partition_dict`
			`self.sharding_sequence = sharding_sequence`
			`if self.sharding_sequence is None:`
			`assert self.dim_partition_dict is not None, f'dim_partition_dict should not be None, if sharding_sequence is NoneType object.'`
			`self.dim_partition_dict = merge_same_dim_mesh_list(dim_size=self.dims,`
			`dim_partition_dict=self.dim_partition_dict)`
			`self.sharding_sequence = self.convert_dict_to_shard_sequence()`

			`elif self.dim_partition_dict is None:`
			`assert self.sharding_sequence is not None, f'sharding_sequence should not be None, if dim_partition_dict is NoneType object.'`
			`self.dim_partition_dict = self.convert_shard_sequence_to_dict()`

			`self._sanity_check()`

			`def _sanity_check(self):`
			`if len(self.sharding_sequence) > self.dims:`
			`raise ShardingOutOfIndexError(`
			`f'sharding_sequence should have {self.dims} elements, but got index {len(self.sharding_sequence)}.')`

[DTensor] refactor CommSpec (#3034) 2023-03-08 02:45:31 +00:00			`if list(self.dim_partition_dict.keys()) and max(list(self.dim_partition_dict.keys())) >= self.dims:`
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`raise ShardingOutOfIndexError(`
			`f'the key of dim_partition_dict should be less than {self.dims}, but got {max(list(self.dim_partition_dict.keys()))}.'`
			`)`

			`def __repr__(self):`
			`res_list = ["ShardingSpec:"]`
			`res_list.append(f"\n\tshard_sequence: " + ",".join(str(dimspec) for dimspec in self.sharding_sequence))`
			`return ' '.join(res_list)`

			`def convert_dict_to_shard_sequence(self):`
			`'''`
			`Convert dim_partition_dict into list of DimSpec, and assign it to sharding_sequence.`
			`'''`
			`sharding_sequence = [DimSpec([])] * self.dims`
			`for dim, shard_list in self.dim_partition_dict.items():`
			`sharding_sequence[dim] = DimSpec(shard_list)`
			`return sharding_sequence`

			`def convert_shard_sequence_to_dict(self):`
			`'''`
			`Convert sharding_sequence into dim_partition_dict.`
			`'''`
			`new_dim_partition_dict = {}`
			`for index, dim_spec in enumerate(self.sharding_sequence):`
			`if not dim_spec.is_replica:`
			`if index not in new_dim_partition_dict:`
			`new_dim_partition_dict[index] = []`
			`new_dim_partition_dict[index].extend(dim_spec.shard_list)`
			`return new_dim_partition_dict`

			`def spec_diff(self, other):`
			`'''`
			`This function is a naive version of difference computation. It just simply accumulates difference every dimension between the`
			`pair of sharding sequence.`

			`Example:`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			```python
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`dim_partition_dict = {0: [0, 1]}`
			`# DistSpec:`
			`# shard_sequence: S01,R,R`
			`# device_mesh_shape: (4, 4)`
			`sharding_spec = ShardingSpec(device_mesh, entire_shape, dim_partition_dict)`
			`dim_partition_dict_to_compare = {0: [0], 1: [1]}`
			`# DistSpec:`
			`# shard_sequence: S0,S1,R`
			`# device_mesh_shape: (4, 4)`
			`sharding_spec_to_compare = ShardingSpec(device_mesh, entire_shape, dim_partition_dict_to_compare)`
			`print(sharding_spec.sharding_sequence_difference(sharding_spec_to_compare))`
[dtensor] polish sharding spec docstring (#3838) * [dtensor] polish sharding spec docstring * [dtensor] polish sharding spec example docstring 2023-05-25 05:09:42 +00:00			`# output: 25`
			```
[DTensor] refactor sharding spec (#2987) * [autoparallel] refactor sharding spec * rename function name 2023-03-07 03:08:11 +00:00			`Argument:`
			`other(ShardingSpec): The ShardingSpec to compared with.`

			`Return:`
			`difference(int): Difference between two ShardingSpec.`
			`'''`
			`assert len(self.sharding_sequence) == len(`
			`other.sharding_sequence), f'Cannot compare difference for two sharding specs with different length.'`
			`difference = 0`
			`for orig_dim_spec, other_dim_spec in zip(self.sharding_sequence, other.sharding_sequence):`
			`difference += orig_dim_spec.dim_diff(other_dim_spec)`
			`return difference`