ColossalAI/colossalai/auto_parallel/tensor_shard/utils/reshape.py

from enum import Enum
from typing import Dict, List, Tuple

import torch


class PreviousStatus(Enum):
    """
    Outcome of the previous size comparison made by detect_reshape_mapping while it walks
    the original shape and the target shape side by side.
    """
    # Nothing is pending: used at the start, after two sizes matched, and after a group of
    # dims has been offloaded into the mapping.
    RESET = 0
    # In the previous comparison the (accumulated) original dimension size was the larger one.
    ORIGIN = 1
    # In the previous comparison the (accumulated) target dimension size was the larger one.
    TGT = 2


def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> Dict[Tuple[int], Tuple[int]]:
    """
    Detect how the dimensions of the original tensor map onto the dimensions of the target
    tensor in a reshape operation.

    Both shapes are walked from the last dimension towards the first. Dimension sizes are
    accumulated (multiplied) on whichever side is currently smaller until the two running
    sizes agree, at which point the group of origin dims and the group of target dims are
    recorded as one entry of the mapping.

    Args:
        origin_shape: shape of the tensor before reshaping.
        tgt_shape: shape of the tensor after reshaping.

    Returns:
        reshape_mapping_dict: The dictionary shows how a tuple of origin dims(keys) maps to the related
        target dims(values) during the reshaping operation. Dims are reported as indices into the
        shapes as they were passed in (not the reversed working copies). A key or a value may be an
        empty tuple when one side has been exhausted and the other only has leftover dims to match.
    Examples:
        import torch
        origin_shape = torch.Size([4, 4, 4])
        tgt_shape = torch.Size([2, 8, 2, 2])
        reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
        print(reshape_mapping_dict)
    Output:
        {(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}
    """

    # Work on reversed list copies so that index 0 is the last dimension of each shape.
    origin_shape = list(origin_shape)
    tgt_shape = list(tgt_shape)
    origin_shape.reverse()
    tgt_shape.reverse()

    # Initialize the walking state. `*_index` indexes the reversed lists, while `*_dims` stores
    # the corresponding dims in the caller's ordering, i.e. `len - index - 1`.
    # NOTE(review): indexing position 0 below presumably requires both shapes to be non-empty — confirm.
    reshape_mapping_dict = {}
    origin_len = len(origin_shape)
    tgt_len = len(tgt_shape)
    origin_index = 0
    tgt_index = 0
    original_dimension_size = origin_shape[origin_index]
    tgt_dimension_size = tgt_shape[tgt_index]
    tgt_dims = [tgt_len - tgt_index - 1]
    origin_dims = [origin_len - origin_index - 1]
    previous_label = PreviousStatus.RESET

    # Keep going until both shapes have been consumed completely.
    while origin_index != len(origin_shape) or tgt_index != len(tgt_shape):
        if original_dimension_size == tgt_dimension_size:
            # The running sizes agree: record the current origin group -> target group.
            # NOTE(review): when origin_dims is empty the key is (), so consecutive leftover
            # target dims write to the same key and only the last one survives — confirm intended.
            reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
            # if the origin_dims has no element, it means the original tensor has been fully matched.
            # Therefore, we do not have to increase the origin_index for that case.
            if len(origin_dims) > 0:
                origin_index += 1
            # if the tgt_dims has no element, it means the target tensor has been fully matched.
            # Therefore, we do not have to increase the tgt_index for that case.
            if len(tgt_dims) > 0:
                tgt_index += 1
            # When both shapes are exhausted there is nothing to prepare for a next step;
            # `continue` jumps back to the loop condition, which is now false and ends the loop.
            if origin_index == len(origin_shape) and tgt_index == len(tgt_shape):
                continue

            # If origin_index equals to origin_len, we just need to set the original_dimension_size
            # to 1 to match the remaining '1's in the target tensor shape.
            if origin_index == len(origin_shape):
                original_dimension_size = 1
                origin_dims = []
            else:
                # otherwise start a fresh origin group from the next origin dim
                original_dimension_size = origin_shape[origin_index]
                origin_dims = [origin_len - origin_index - 1]

            # If tgt_index equals to tgt_len, we just need to set the tgt_dimension_size
            # to 1 to match the remaining '1's in the original tensor shape.
            if tgt_index == len(tgt_shape):
                tgt_dimension_size = 1
                tgt_dims = []
            else:
                # otherwise start a fresh target group from the next target dim
                tgt_dimension_size = tgt_shape[tgt_index]
                tgt_dims = [tgt_len - tgt_index - 1]

            # a new pair of groups starts, so no previous comparison is pending
            previous_label = PreviousStatus.RESET

        elif original_dimension_size > tgt_dimension_size:
            # The origin side is larger, so move on to the next target dim.
            # NOTE(review): tgt_index is not bounds-checked before tgt_shape[tgt_index] is read
            # below; presumably callers only pass shapes with the same number of elements — confirm.
            tgt_index += 1

            if previous_label == PreviousStatus.TGT:
                # if the target dimension size is larger in the previous comparison, which means
                # the origin dimension size has already accumulated larger than target dimension size, so
                # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
                # carry over the part of the origin size not covered by the offloaded target group
                original_dimension_size = original_dimension_size // tgt_dimension_size
                origin_dims = [origin_len - origin_index - 1]
                tgt_dimension_size = tgt_shape[tgt_index]
                # the new target group holds the newly reached dim and the dim just before it
                # (`tgt_len - tgt_index` is the dim of reversed index `tgt_index - 1`)
                tgt_dims = [tgt_len - tgt_index - 1, tgt_len - tgt_index]
                # reset the previous_label after offloading the origin dims and tgt dims
                previous_label = PreviousStatus.RESET
            else:
                # accumulate the tgt_dimension_size until tgt_dimension_size larger than original_dimension_size
                tgt_dimension_size *= tgt_shape[tgt_index]
                tgt_dims.append(tgt_len - tgt_index - 1)
                previous_label = PreviousStatus.ORIGIN

        else:
            # The target side is larger, so move on to the next origin dim (mirror of the branch above).
            # NOTE(review): origin_index is not bounds-checked before origin_shape[origin_index] is
            # read below; presumably callers only pass shapes with the same number of elements — confirm.
            origin_index += 1

            if previous_label == PreviousStatus.ORIGIN:
                # if the origin element is larger in the previous comparison, which means
                # the target element has already accumulated larger than origin element, so
                # we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
                reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
                # carry over the part of the target size not covered by the offloaded origin group
                tgt_dimension_size = tgt_dimension_size // original_dimension_size
                tgt_dims = [tgt_len - tgt_index - 1]
                original_dimension_size = origin_shape[origin_index]
                # the new origin group holds the newly reached dim and the dim just before it
                # (`origin_len - origin_index` is the dim of reversed index `origin_index - 1`)
                origin_dims = [origin_len - origin_index - 1, origin_len - origin_index]
                # reset the previous_label after offloading the origin dims and tgt dims
                previous_label = PreviousStatus.RESET
            else:
                # accumulate the original_dimension_size until original_dimension_size larger than tgt_dimension_size
                original_dimension_size *= origin_shape[origin_index]
                origin_dims.append(origin_len - origin_index - 1)
                previous_label = PreviousStatus.TGT

    return reshape_mapping_dict


def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
                               reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> bool:
    """
    Check whether the reshape operation can be carried out without first converting
    the input to fully replicated status.

    Rule:
        For a sharded dimension of the input tensor, if it is not the minimum element of the
        input tuple it belongs to, the function returns False.
        To illustrate this, there are two cases to analyze:
        1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal
        operation without distributed tensor.
        2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape
        consistency process, torch.cat will be implemented on the sharded dim, and everything after the sharded
        dim get recovered.

    Args:
        input_dim_partition_dict: the sharded dims of the input are the keys of this dict;
            the values are not inspected by this function.
        reshape_mapping_dict: mapping from tuples of input dims to tuples of output dims, as
            produced by detect_reshape_mapping. Only the keys are inspected; empty keys are skipped.

    Returns:
        True if every sharded input dim is the minimum element of the key tuple containing it
        (or if there is no sharded dim in any key), False otherwise.

    Examples:
        # the second dimension of the input has been sharded.
        input_dim_partition_dict = {1: [1]}
        origin_shape = torch.Size([8, 4, 2])
        tgt_shape = torch.Size([2, 4, 8])
        reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
        # {(2, 1): (2,), (0,): (1, 0)}
        # the sharded dim of input is 1, which is the minimum element of the tuple (2, 1),
        # so we do not have to convert the input to fully replicated status.
        print(check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict))

    Output:
        True
    """
    # A set gives constant-time membership tests for the sharded dims.
    sharded_dims = set(input_dim_partition_dict)
    for input_dims in reshape_mapping_dict:
        # An empty key carries no input dim, so there is nothing to check (and min() would fail).
        if not input_dims:
            continue
        min_element = min(input_dims)
        # Compare dims by value: `is not` on ints tests object identity, which only happens
        # to agree with equality because of interpreter-level int caching / object reuse.
        if any(dim != min_element and dim in sharded_dims for dim in input_dims):
            return False
    return True


def infer_output_dim_partition_dict(input_dim_partition_dict: Dict[int, List[int]],
                                    reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> Dict[int, List[int]]:
    """
    Infer the output dim partition dict for a reshape operation, given the input dim
    partition dict and the reshape mapping dict.

    Args:
        input_dim_partition_dict: maps each sharded input dim to its partition entry.
        reshape_mapping_dict: mapping from tuples of input dims to tuples of output dims, as
            produced by detect_reshape_mapping.

    Returns:
        A dict keyed by output dim. For every mapping entry whose input dims contain a sharded
        dim, the smallest output dim of that entry receives the partition entry of the sharded
        input dim (the same object, not a copy).

    Raises:
        AssertionError: if check_keep_sharding_status rejects the given combination.
    """
    assert check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict), \
        'we only infer output dim partition dict for the reshape operation could keep sharding spec.'
    output_dim_partition_dict = {}
    for input_dims, output_dims in reshape_mapping_dict.items():
        for dim in input_dims:
            # the keys of input_dim_partition_dict are exactly the sharded input dims
            if dim in input_dim_partition_dict:
                output_dim_partition_dict[min(output_dims)] = input_dim_partition_dict[dim]
                # we could break because input dims cannot contain two sharded dims, otherwise
                # the keep sharding status check will fail.
                break
    return output_dim_partition_dict
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`from enum import Enum`
			`from typing import Dict, List, Tuple`

			`import torch`


			`class PreviousStatus(Enum):`
			`"""`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`This class shows the status of previous comparison.`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`"""`
			`RESET = 0`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`# ORIGIN means the dimension size of original tensor is larger in the previous comparison.`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`ORIGIN = 1`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`# TGT means the dimension size of target tensor is larger in the previous comparison.`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`TGT = 2`


			`def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> Dict[Tuple[int], Tuple[int]]:`
			`"""`
			`This method is used to detect the reshape mapping between original tensor and target tensor.`

			`Returns:`
			`reshape_mapping_dict: The dictionary shows how a tuple of origin dims(keys) mapping to the related`
			`target dims(values) during reshaping operation.`
			`Examples:`
			`import torch`
			`origin_shape = torch.Size([4, 4, 4])`
			`tgt_shape = torch.Size([2, 8, 2, 2])`
			`reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)`
			`print(reshape_mapping_dict)`
			`Output:`
			`{(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}`
			`"""`

			`# reverse the shape object`
			`origin_shape = list(origin_shape)`
			`tgt_shape = list(tgt_shape)`
			`origin_shape.reverse()`
			`tgt_shape.reverse()`

			`# initialize arguments`
			`reshape_mapping_dict = {}`
			`origin_len = len(origin_shape)`
			`tgt_len = len(tgt_shape)`
			`origin_index = 0`
			`tgt_index = 0`
			`original_dimension_size = origin_shape[origin_index]`
			`tgt_dimension_size = tgt_shape[tgt_index]`
			`tgt_dims = [tgt_len - tgt_index - 1]`
			`origin_dims = [origin_len - origin_index - 1]`
			`previous_label = PreviousStatus.RESET`

			`while origin_index != len(origin_shape) or tgt_index != len(tgt_shape):`
			`if original_dimension_size == tgt_dimension_size:`
			`reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)`
[autoparallel] adapt solver with self attention (#2037) * [autoparallel] adapt solver with self attention * polish code 2 years ago			`# if the origin_dims has no element, it means the original tensor has been fully matched.`
			`# Therefore, we do not have to increase the origin_index for that case.`
			`if len(origin_dims) > 0:`
			`origin_index += 1`
			`# if the tgt_dims has no element, it means the original tensor has been fully matched.`
			`# Therefore, we do not have to increase the tgt_index for that case.`
			`if len(tgt_dims) > 0:`
			`tgt_index += 1`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`# the last step of loop should always end with condition`
			`# so we need to manually skip the preparation for next step`
			`# in the last step.`
[autoparallel] adapt solver with self attention (#2037) * [autoparallel] adapt solver with self attention * polish code 2 years ago			`if origin_index == len(origin_shape) and tgt_index == len(tgt_shape):`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`continue`
[autoparallel] adapt solver with self attention (#2037) * [autoparallel] adapt solver with self attention * polish code 2 years ago
			`# If origin_index equals to origin_len, we just need to set the original_dimension_size`
			`# to 1 to match the remaining '1's in the target tensor shape.`
			`if origin_index == len(origin_shape):`
			`original_dimension_size = 1`
			`origin_dims = []`
			`else:`
			`original_dimension_size = origin_shape[origin_index]`
			`origin_dims = [origin_len - origin_index - 1]`

			`# If tgt_index equals to tgt_len, we just need to set the tgt_dimension_size`
			`# to 1 to match the remaining '1's in the original tensor shape.`
			`if tgt_index == len(tgt_shape):`
			`tgt_dimension_size = 1`
			`tgt_dims = []`
			`else:`
			`tgt_dimension_size = tgt_shape[tgt_index]`
			`tgt_dims = [tgt_len - tgt_index - 1]`

[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`previous_label = PreviousStatus.RESET`

			`elif original_dimension_size > tgt_dimension_size:`
			`tgt_index += 1`

			`if previous_label == PreviousStatus.TGT:`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`# if the target dimension size is larger in the previous comparison, which means`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`# the origin dimension size has already accumulated larger than target dimension size, so`
			`# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.`
			`reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)`
			`original_dimension_size = original_dimension_size // tgt_dimension_size`
			`origin_dims = [origin_len - origin_index - 1]`
			`tgt_dimension_size = tgt_shape[tgt_index]`
			`tgt_dims = [tgt_len - tgt_index - 1, tgt_len - tgt_index]`
			`# reset the previous_label after offloading the origin dims and tgt dims`
			`previous_label = PreviousStatus.RESET`
			`else:`
			`# accumulate the tgt_dimension_size until tgt_dimension_size larger than original_dimension_size`
			`tgt_dimension_size *= tgt_shape[tgt_index]`
			`tgt_dims.append(tgt_len - tgt_index - 1)`
			`previous_label = PreviousStatus.ORIGIN`

			`else:`
			`origin_index += 1`

			`if previous_label == PreviousStatus.ORIGIN:`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`# if the origin element is larger in the previous comparison, which means`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`# the target element has already accumulated larger than origin element, so`
			`# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.`
			`reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)`
			`tgt_dimension_size = tgt_dimension_size // original_dimension_size`
			`tgt_dims = [tgt_len - tgt_index - 1]`
			`original_dimension_size = origin_shape[origin_index]`
			`origin_dims = [origin_len - origin_index - 1, origin_len - origin_index]`
			`# reset the previous_label after offloading the origin dims and tgt dims`
			`previous_label = PreviousStatus.RESET`
			`else:`
			`# accumulate the original_dimension_size until original_dimension_size larger than tgt_dimension_size`
			`original_dimension_size *= origin_shape[origin_index]`
			`origin_dims.append(origin_len - origin_index - 1)`
			`previous_label = PreviousStatus.TGT`

			`return reshape_mapping_dict`


			`def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],`
			`reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> bool:`
			`"""`
			`This method is used to check whether the reshape operation could implement without converting`
			`the input to fully replicated status.`

			`Rule:`
			`For a sharded dimension of input tensor, if it is not the minimum element of the input tuple,`
			`the function will return false.`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`To illustrate this issue, there are two cases to analyze:`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`1. no sharded dims in the input tuple: we could do the reshape operation safely just as the normal`
			`operation without distributed tensor.`
			`2. sharded dims in the input tuple: the sharded dim must be the minimum element, then during shape`
			`consistency process, torch.cat will be implemented on the sharded dim, and everything after the sharded`
			`dim get recovered.`

			`Examples:`
			`# the second dimension of the input has been sharded.`
			`input_dim_partition_dict = {1: [1]}`
			`origin_shape = torch.Size([8, 4, 2])`
			`tgt_shape = torch.Size([2, 4, 8])`
			`reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)`
			`# {(2, 1): (2,), (0,): (1, 0)}`
			`# the sharded dim of input is 1, which is the minimum element of the tuple (2, 1),`
			`# so we do not have to convert the input to fully replicated status.`
			`print(check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict))`

			`Output:`
			`True`
			`"""`
			`sharded_dims = list(input_dim_partition_dict.keys())`
			`for input_dims in reshape_mapping_dict.keys():`
[autoparallel] adapt solver with self attention (#2037) * [autoparallel] adapt solver with self attention * polish code 2 years ago			`# if input_dims has no element, we could just skip this iteration.`
			`if len(input_dims) == 0:`
			`continue`
[autoparallel] add experimental view handler (#2011) * [autoparallel] add experimental view handler * polish * polish * polish code * rename variables 2 years ago			`min_element = min(input_dims)`
			`for dim in input_dims:`
			`if dim in sharded_dims and dim is not min_element:`
			`return False`
			`return True`


			`def infer_output_dim_partition_dict(input_dim_partition_dict: Dict[int, List[int]],`
			`reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> Dict[Tuple[int], Tuple[int]]:`
			`"""`
			`This method is used to infer the output dim partition dict for a reshape operation,`
			`given the input dim partition dict and reshape mapping dict.`
			`"""`
			`assert check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict), \`
			`'we only infer output dim partition dict for the reshape operation could keep sharding spec.'`
			`sharded_dims = list(input_dim_partition_dict.keys())`
			`output_dim_partition_dict = {}`
			`for input_dims, output_dims in reshape_mapping_dict.items():`
			`for dim in input_dims:`
			`if dim in sharded_dims:`
			`output_dim_partition_dict[min(output_dims)] = input_dim_partition_dict[dim]`
			`# we could break because input dims cannot contain two sharded dims, otherwise`
			`# the keep sharding status check will fail.`
			`break`
			`return output_dim_partition_dict`