# ColossalAI/colossalai/tensor/d_tensor/api.py

import copy
import operator
from functools import reduce
from typing import Union

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

from colossalai.device.device_mesh import DeviceMesh

from .layout import Layout
from .layout_converter import LayoutConverter
from .sharding_spec import ShardingSpec

layout_converter = LayoutConverter()


def clear_layout_converter():
    global layout_converter
    layout_converter.cached_solution.clear()


def is_distributed_tensor(tensor: torch.Tensor) -> bool:
    """
    Check whether the given tensor is a distributed tensor.

    Args:
        tensor (torch.Tensor): The tensor to be checked.

    Returns:
        bool: Whether the given tensor is a distributed tensor.
    """
    return hasattr(tensor, "dist_layout")


def is_sharded(dtensor: torch.Tensor) -> bool:
    """
    Check if a tensor is sharded.

    Args:
        dtensor (torch.Tensor): The tensor to be checked.

    Returns:
        bool: True if the tensor is sharded, False otherwise.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    return list(dtensor.shape) == list(dtensor.dist_layout.global_shape)


def _hijack_detach_and_clone(dtensor: torch.Tensor) -> torch.Tensor:
    """
    Hijack the detach and clone methods of the tensor so that the dist_layout is carried over.

    Args:
        dtensor (torch.Tensor): The tensor to be hijacked.

    Returns:
        torch.Tensor: The hijacked tensor.
    """
    dtensor._old_detach = dtensor.detach
    dtensor._old_clone = dtensor.clone

    def new_detach(self):
        t_ = self._old_detach()
        t_.dist_layout = copy.deepcopy(self.dist_layout)
        return t_

    def new_clone(self, *args, **kwargs):
        t_ = self._old_clone(*args, **kwargs)
        t_.dist_layout = copy.deepcopy(self.dist_layout)
        return t_

    # bind the new methods to the tensor
    dtensor.detach = new_detach.__get__(dtensor)
    dtensor.clone = new_clone.__get__(dtensor)
    return dtensor


def _construct_default_sharding_spec(tensor: torch.Tensor) -> ShardingSpec:
    """
    Construct the default sharding specification for the tensor.

    Args:
        tensor (torch.Tensor): The tensor to be sharded.

    Returns:
        ShardingSpec: A sharding spec without any sharding specified.
    """
    return ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={})


def _apply_layout(tensor, layout):
    """
    Apply the layout to the local tensor during the initialization process.
    """
    # the layout converter requires a source and a target layout,
    # so we construct a source layout for the unsharded tensor
    # and use the given layout as the target layout for the sharded tensor
    source_spec = _construct_default_sharding_spec(tensor)
    source_layout = Layout(device_mesh=layout.device_mesh, sharding_spec=source_spec, global_shape=tensor.shape)
    sharded_tensor = layout_converter.apply(tensor=tensor, source_layout=source_layout, target_layout=layout)
    return sharded_tensor


def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> torch.Tensor:
    """
    Convert the given tensor to a distributed tensor.

    Args:
        tensor (torch.Tensor): The tensor to be converted.
        device_mesh (DeviceMesh): The device mesh for abstraction of the compute devices.
        sharding_spec (ShardingSpec): The sharding specification which describes how the tensor will be sharded.

    Returns:
        torch.Tensor: The distributed tensor.
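
    Example:
        A minimal sketch, assuming ``torch.distributed`` is already initialized and all ranks
        form a 1-D device mesh (the shape and spec values below are illustrative only):

        ```python
        import torch
        import torch.distributed as dist

        # build a device mesh from the default (global) process group
        device_mesh = DeviceMesh.from_process_group(dist.GroupMember.WORLD)
        tensor = torch.rand(4, 4)
        # shard dim 0 of the tensor along mesh axis 0
        sharding_spec = ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={0: [0]})
        dtensor = distribute_tensor(tensor, device_mesh, sharding_spec)
        ```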
"""
    assert not is_distributed_tensor(tensor), 'The input tensor is already a distributed tensor.'
    dist_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=tensor.shape)

    # shard tensor
    sharded_tensor = _apply_layout(tensor, dist_layout)

    # hack some tensor methods
    _hijack_detach_and_clone(sharded_tensor)

    return sharded_tensor


def redistribute(dtensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> torch.Tensor:
    """
    Convert the layout of the given distributed tensor to the layout described by the given
    device mesh and sharding spec, and return the resharded tensor.

    Args:
        dtensor (torch.Tensor): the distributed tensor to be converted.
        device_mesh (DeviceMesh): the device mesh for abstraction of the compute devices.
        sharding_spec (ShardingSpec): the target sharding specification.

    Returns:
        torch.Tensor: the tensor resharded according to the target layout.
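
    Example:
        A minimal sketch, assuming ``dtensor`` was produced by ``distribute_tensor`` on a 1-D
        device mesh (the target spec below is illustrative only):

        ```python
        # reshard from the current layout to a column-wise (last-dim) sharding on the same mesh
        new_spec = ShardingSpec(dim_size=dtensor.dim(), dim_partition_dict={-1: [0]})
        resharded = redistribute(dtensor, get_device_mesh(dtensor), new_spec)
        ```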
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    global_shape = get_global_shape(dtensor)
    target_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=global_shape)
    resharded_tensor = layout_converter.apply(tensor=dtensor,
                                              source_layout=dtensor.dist_layout,
                                              target_layout=target_layout)
    return resharded_tensor


def to_global(dtensor: torch.Tensor) -> torch.Tensor:
    """
    Convert a distributed tensor to the global tensor with the given layout.
    This function returns a native `torch.Tensor` object.

    Args:
        dtensor (torch.Tensor): the distributed tensor to be converted.

    Returns:
        torch.Tensor: the global tensor.
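
    Example:
        A minimal sketch, assuming ``torch.distributed`` is already initialized and ``dtensor``
        was created with ``distribute_tensor`` or one of the ``shard_*`` helpers below:

        ```python
        dtensor = shard_rowwise(torch.rand(4, 4))
        full_tensor = to_global(dtensor)  # every rank recovers the full, unsharded tensor
        ```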
"""
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'

    # use a fresh LayoutConverter here instead of the module-level cached one
    layout_converter = LayoutConverter()
    global_sharding_spec = ShardingSpec(dtensor.dim(), {})
    device_mesh = get_device_mesh(dtensor)
    global_shape = get_global_shape(dtensor)
    global_layout = Layout(device_mesh=device_mesh, sharding_spec=global_sharding_spec, global_shape=global_shape)

    global_tensor = layout_converter.apply(dtensor, dtensor.dist_layout, global_layout)
    return global_tensor


def shard_rowwise(tensor: torch.Tensor, group_or_device_mesh: Union[ProcessGroup, DeviceMesh] = None) -> torch.Tensor:
    """
    Shard the first dim of the given tensor.

    Args:
        tensor (torch.Tensor): The tensor to be sharded.
        group_or_device_mesh (Union[ProcessGroup, DeviceMesh], optional): The group or device mesh to shard the tensor.
            If None, the tensor will be sharded with respect to the global process group.
            Defaults to None.

    Returns:
        torch.Tensor: The sharded tensor.
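
    Example:
        A minimal sketch, assuming ``torch.distributed`` is already initialized; the shape is
        illustrative only:

        ```python
        weight = torch.rand(8, 4)
        # each rank keeps a contiguous slice of the rows
        sharded_weight = shard_rowwise(weight)
        ```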
"""
    # if the group_or_device_mesh is None, we shard the tensor with respect to the global process group
    if group_or_device_mesh is None:
        group_or_device_mesh = dist.GroupMember.WORLD

    if isinstance(group_or_device_mesh, ProcessGroup):
        device_mesh = DeviceMesh.from_process_group(group_or_device_mesh)
    else:
        assert len(group_or_device_mesh.shape) == 1, 'Only 1D DeviceMesh is accepted for row-wise sharding.'
        device_mesh = group_or_device_mesh

    sharding_spec = ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={0: [0]})
    return distribute_tensor(tensor, device_mesh, sharding_spec)


def shard_colwise(tensor: torch.Tensor, group_or_device_mesh: Union[ProcessGroup, DeviceMesh] = None) -> torch.Tensor:
    """
    Shard the last dim of the given tensor.

    Args:
        tensor (torch.Tensor): The tensor to be sharded.
        group_or_device_mesh (Union[ProcessGroup, DeviceMesh], optional): The group or device mesh to shard the tensor.
            If None, the tensor will be sharded with respect to the global process group.
            Defaults to None.

    Returns:
        torch.Tensor: The sharded tensor.
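
    Example:
        A minimal sketch, assuming ``torch.distributed`` is already initialized; the shape is
        illustrative only:

        ```python
        weight = torch.rand(4, 8)
        # each rank keeps a contiguous slice of the columns
        sharded_weight = shard_colwise(weight)
        ```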
"""
    # if the group_or_device_mesh is None, we shard the tensor with respect to the global process group
    if group_or_device_mesh is None:
        group_or_device_mesh = dist.GroupMember.WORLD

    if isinstance(group_or_device_mesh, ProcessGroup):
        device_mesh = DeviceMesh.from_process_group(group_or_device_mesh)
    else:
        assert len(group_or_device_mesh.shape) == 1, 'Only 1D DeviceMesh is accepted for column-wise sharding.'
        device_mesh = group_or_device_mesh

    sharding_spec = ShardingSpec(dim_size=tensor.dim(), dim_partition_dict={-1: [0]})
    return distribute_tensor(tensor, device_mesh, sharding_spec)


def sharded_tensor_to_param(dtensor: torch.Tensor, requires_grad: bool = True) -> torch.nn.Parameter:
    """
    Wrap the given distributed tensor in a ``torch.nn.Parameter`` that keeps its ``dist_layout``.
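
    Example:
        A minimal sketch; ``linear`` is a hypothetical ``torch.nn.Linear`` module whose weight
        is replaced by a sharded parameter:

        ```python
        # `linear` is hypothetical and only used for illustration
        dtensor = shard_rowwise(linear.weight.data)
        linear.weight = sharded_tensor_to_param(dtensor)
        ```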
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    param = torch.nn.Parameter(dtensor, requires_grad=requires_grad)

    # make it distributed as well
    param.dist_layout = dtensor.dist_layout
    _hijack_detach_and_clone(param)

    return param


def sharded_tensor_to_existing_param(dtensor: torch.Tensor, param: torch.nn.Parameter) -> None:
    """
    Attach the given distributed tensor to an existing ``torch.nn.Parameter`` in place.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    param.data = dtensor

    # make it distributed as well
    param.dist_layout = dtensor.dist_layout
    _hijack_detach_and_clone(param)


def compute_global_numel(dtensor: torch.Tensor) -> int:
    """
    Compute the global number of elements in the distributed tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        int: The global number of elements in the distributed tensor.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    numel = reduce(operator.mul, dtensor.dist_layout.global_shape)
    return numel


def get_layout(dtensor: torch.Tensor) -> Layout:
    """
    Get the layout of the distributed tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        Layout: The layout of the distributed tensor.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    return dtensor.dist_layout


def get_global_shape(dtensor: torch.Tensor) -> torch.Size:
    """
    Get the global shape of the distributed tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        torch.Size: The global shape of the distributed tensor.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    return dtensor.dist_layout.global_shape


def get_device_mesh(dtensor: torch.Tensor) -> DeviceMesh:
    """
    Get the device mesh of the distributed tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        DeviceMesh: The device mesh of the distributed tensor.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    return dtensor.dist_layout.device_mesh


def get_sharding_spec(dtensor: torch.Tensor) -> ShardingSpec:
    """
    Get the sharding spec of the distributed tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        ShardingSpec: The sharding spec of the distributed tensor.
    """
    assert is_distributed_tensor(dtensor), 'The input tensor is not a distributed tensor.'
    return dtensor.dist_layout.sharding_spec


# ======================================================
# Some sharding does not obey the SPMD style,
# e.g. the fused QKV layer in GPT2.
# We support customized sharding with the following APIs.
# ======================================================
def is_customized_distributed_tensor(tensor: torch.Tensor) -> bool:
    """
    Check whether the given tensor is a customized distributed tensor.

    Args:
        tensor (torch.Tensor): The tensor to be checked.

    Returns:
        bool: Whether the given tensor is a customized distributed tensor.
    """
    return hasattr(tensor, 'shard_fn') and hasattr(tensor, 'gather_fn')


def _hijack_detach_and_clone_for_customized_distributed_tensor(dtensor: torch.Tensor) -> torch.Tensor:
    """
    Hijack the detach and clone methods of the tensor so that the shard_fn and gather_fn are carried over.

    Args:
        dtensor (torch.Tensor): The tensor to be hijacked.

    Returns:
        torch.Tensor: The hijacked tensor.
    """
    dtensor._old_detach = dtensor.detach
    dtensor._old_clone = dtensor.clone

    def new_detach(self):
        t_ = self._old_detach()
        t_.shard_fn = self.shard_fn
        t_.gather_fn = self.gather_fn
        return t_

    def new_clone(self, *args, **kwargs):
        t_ = self._old_clone(*args, **kwargs)
        t_.shard_fn = self.shard_fn
        t_.gather_fn = self.gather_fn
        return t_

    # bind the new methods to the tensor
    dtensor.detach = new_detach.__get__(dtensor)
    dtensor.clone = new_clone.__get__(dtensor)
    return dtensor


def distribute_tensor_with_customization(tensor: torch.Tensor, shard_fn: callable, gather_fn: callable):
    """
    Distribute the given tensor with the given shard_fn and gather_fn.

    Example:

    ```python
    # define shard and gather functions
    def shard_fn(tensor):
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
        return tensor.chunk(world_size, dim=0)[rank]

    def gather_fn(tensor):
        world_size = torch.distributed.get_world_size()
        shard_list = [torch.zeros_like(tensor) for _ in range(world_size)]
        torch.distributed.all_gather(shard_list, tensor)
        return torch.cat(shard_list, dim=0)

    # create a distributed tensor
    tensor = torch.rand(4, 4)
    dtensor = distribute_tensor_with_customization(tensor, shard_fn, gather_fn)
    ```

    Args:
        tensor (torch.Tensor): The tensor to be distributed.
        shard_fn (callable): The function to shard the tensor.
        gather_fn (callable): The function to gather the tensor.

    Returns:
        torch.Tensor: The distributed tensor.
    """
    assert callable(shard_fn), 'The shard_fn must be callable.'
    assert callable(gather_fn), 'The gather_fn must be callable.'
    assert not is_distributed_tensor(tensor), 'The input tensor is already a distributed tensor.'

    sharded_tensor = shard_fn(tensor)

    # set the shard_fn and gather_fn as attributes of the distributed tensor
    sharded_tensor.shard_fn = shard_fn
    sharded_tensor.gather_fn = gather_fn

    # hijack detach and clone so that shard_fn and gather_fn are carried over
    _hijack_detach_and_clone_for_customized_distributed_tensor(sharded_tensor)

    return sharded_tensor


def to_global_for_customized_distributed_tensor(dtensor: torch.Tensor) -> torch.Tensor:
    """
    Gather the given tensor to the global tensor.

    Args:
        dtensor (torch.Tensor): The distributed tensor.

    Returns:
        torch.Tensor: The global tensor.
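
    Example:
        A minimal sketch, assuming ``shard_fn`` and ``gather_fn`` are defined as in the
        ``distribute_tensor_with_customization`` example above:

        ```python
        dtensor = distribute_tensor_with_customization(torch.rand(4, 4), shard_fn, gather_fn)
        full_tensor = to_global_for_customized_distributed_tensor(dtensor)
        ```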
"""
    assert is_customized_distributed_tensor(dtensor), 'The input tensor is not a customized distributed tensor.'
    return dtensor.gather_fn(dtensor)


def customized_distributed_tensor_to_param(dtensor: torch.Tensor, requires_grad: bool = True):
    """
    Convert the given customized distributed tensor to a parameter.
    """
    assert is_customized_distributed_tensor(dtensor), 'The input tensor is not a customized distributed tensor.'

    param = torch.nn.Parameter(dtensor, requires_grad=requires_grad)

    # make it distributed as well
    param.shard_fn = dtensor.shard_fn
    param.gather_fn = dtensor.gather_fn
    _hijack_detach_and_clone_for_customized_distributed_tensor(param)

    return param


def customized_distributed_tensor_to_existing_param(dtensor: torch.Tensor, param: torch.nn.Parameter):
    """
    Convert the given customized distributed tensor to an existing parameter.
    """
    assert is_customized_distributed_tensor(dtensor), 'The input tensor is not a customized distributed tensor.'

    param.data = dtensor.data
    param.shard_fn = dtensor.shard_fn
    param.gather_fn = dtensor.gather_fn
    _hijack_detach_and_clone_for_customized_distributed_tensor(param)