2022-10-14 05:27:00 +00:00
|
|
|
import operator
|
|
|
|
from copy import deepcopy
|
|
|
|
from functools import reduce
|
2022-10-12 03:16:18 +00:00
|
|
|
from typing import Dict
|
2022-10-14 05:27:00 +00:00
|
|
|
|
|
|
|
import torch
|
|
|
|
|
2022-10-12 03:16:18 +00:00
|
|
|
from colossalai.tensor.sharding_spec import ShardingSpec
|
2022-10-14 05:27:00 +00:00
|
|
|
|
|
|
|
# Public helpers exported by this module.
__all__ = [
    "transpose_partition_dim",
    "update_partition_dim",
    "enumerate_all_possible_1d_sharding",
    "enumerate_all_possible_2d_sharding",
    "generate_sharding_size",
]
|
2022-10-12 03:16:18 +00:00
|
|
|
|
|
|
|
|
2022-10-20 08:12:39 +00:00
|
|
|
def transpose_partition_dim(sharding_spec: ShardingSpec, dim1: int, dim2: int) -> ShardingSpec:
    """
    Switch the sharding mesh dimensions for two tensor dimensions. This operation is in-place.

    Args:
        sharding_spec (ShardingSpec): the sharding spec for which partition dim are switched
        dim1 (int): the tensor dimension to switch, negative values count from the last dimension
        dim2 (int): the tensor dimension to switch, negative values count from the last dimension

    Returns:
        ShardingSpec: the same ``sharding_spec`` object, re-initialized with the transposed shape
            and the switched dim partition dict

    Raises:
        IndexError: if ``dim1`` or ``dim2`` is out of range for ``sharding_spec.entire_shape``
    """
    num_dims = len(sharding_spec.entire_shape)
    assert num_dims >= 2, "The entire_shape of the sharding spec must have at least 2 dimensions"

    # validate both dims before anything is mutated, so that a bad index
    # cannot leave the spec with a half-switched partition dict
    for dim in (dim1, dim2):
        if not -num_dims <= dim < num_dims:
            raise IndexError(
                f"Dimension out of range (expected to be in range of [{-num_dims}, {num_dims - 1}], but got {dim})"
            )

    # the partition dict is looked up with the dim as given, whereas the shape is
    # indexed like a list; normalize negative dims so that both agree
    dim1 %= num_dims
    dim2 %= num_dims

    dim_partition_dict = sharding_spec.dim_partition_dict

    # transpose the dim partition
    dim1_partition = dim_partition_dict.pop(dim1, None)
    dim2_partition = dim_partition_dict.pop(dim2, None)

    # a missing (or empty) partition means there is nothing to move over
    if dim1_partition:
        dim_partition_dict[dim2] = dim1_partition
    if dim2_partition:
        dim_partition_dict[dim1] = dim2_partition

    # get the transposed shape
    new_shape = list(sharding_spec.entire_shape[:])
    new_shape[dim2], new_shape[dim1] = new_shape[dim1], new_shape[dim2]
    new_shape = torch.Size(new_shape)

    # re-init the sharding spec
    sharding_spec.__init__(
        device_mesh=sharding_spec.device_mesh, entire_shape=new_shape, dim_partition_dict=dim_partition_dict
    )
    return sharding_spec
|
|
|
|
|
|
|
|
|
2023-09-19 06:20:26 +00:00
|
|
|
def update_partition_dim(
    sharding_spec: ShardingSpec, dim_mapping: Dict[int, int], physical_shape: torch.Size, inplace: bool = False
) -> ShardingSpec:
    """
    This method is used to update the partition dim dict from the logical one to the physical one.

    Args:
        sharding_spec (ShardingSpec): the sharding spec for which partition dims are updated
        dim_mapping (Dict[int, int]): the mapping from the logical tensor dimension to the physical tensor dimension
        physical_shape (torch.Size): the physical shape for the tensor
        inplace (bool): if True, ``sharding_spec`` itself is re-initialized and returned,
            otherwise a deep copy is updated and the given spec is left as it is

    Returns:
        ShardingSpec: the sharding spec re-initialized with ``physical_shape`` and the remapped dim partition dict

    Raises:
        KeyError: if a key of ``dim_mapping`` is not present in the dim partition dict, or if
            two entries end up on the same tensor dimension after the remapping
    """
    current_sharding_spec = sharding_spec if inplace else deepcopy(sharding_spec)

    # work on a shallow copy: entries are popped while remapping, and a KeyError raised
    # half-way must not leave the (possibly caller-owned) spec with missing entries
    remaining_dim_partition_dict = dict(current_sharding_spec.dim_partition_dict)
    new_dim_partition_dict = {}

    # assign new dim
    for old_dim, new_dim in dim_mapping.items():
        mesh_dims = remaining_dim_partition_dict.pop(old_dim)
        # two logical dims mapped onto the same physical dim would silently overwrite each other
        if new_dim in new_dim_partition_dict:
            raise KeyError(f"There are duplicated entries for the tensor sharding dimension {new_dim}")
        new_dim_partition_dict[new_dim] = mesh_dims

    # dims which are not remapped keep their index, unless it is already taken
    for tensor_dim, mesh_dims in remaining_dim_partition_dict.items():
        if tensor_dim in new_dim_partition_dict:
            raise KeyError(f"There are duplicated entries for the tensor sharding dimension {tensor_dim}")
        new_dim_partition_dict[tensor_dim] = mesh_dims

    # update sharding spec
    # the device mesh is taken from the given spec, not from the deep copy
    current_sharding_spec.__init__(
        device_mesh=sharding_spec.device_mesh, entire_shape=physical_shape, dim_partition_dict=new_dim_partition_dict
    )
    return current_sharding_spec
|
2022-10-14 05:27:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
def enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size):
    """
    Enumerate the dim partition dicts which use both mesh dimensions.

    For every pair of tensor dimensions ``i < j`` below ``dim_size`` two dicts are produced, one per
    assignment of the two mesh dimensions to the pair. After all pairs, one dict per tensor
    dimension is appended in which that dimension takes both mesh dimensions.

    Args:
        mesh_dim_0: the first mesh dimension
        mesh_dim_1: the second mesh dimension
        dim_size: the number of tensor dimensions to enumerate over

    Returns:
        list: the dim partition dicts, pair-wise cases first, then the flattened cases
    """
    shardings = []

    # two different tensor dims, each one taking one of the mesh dims
    for first_dim in range(dim_size):
        for second_dim in range(first_dim + 1, dim_size):
            shardings.extend(
                (
                    {first_dim: [mesh_dim_0], second_dim: [mesh_dim_1]},
                    {first_dim: [mesh_dim_1], second_dim: [mesh_dim_0]},
                )
            )

    # a single tensor dim taking both mesh dims
    shardings.extend({tensor_dim: [mesh_dim_0, mesh_dim_1]} for tensor_dim in range(dim_size))

    return shardings
|
|
|
|
|
|
|
|
|
|
|
|
def enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size):
    """
    Enumerate the dim partition dicts which use a single mesh dimension.

    Args:
        mesh_dim_0: the mesh dimension used for the sharding
        dim_size: the number of tensor dimensions to enumerate over

    Returns:
        list: one dim partition dict per tensor dimension, in ascending order of the tensor dimension
    """
    return [{tensor_dim: [mesh_dim_0]} for tensor_dim in range(dim_size)]
|
|
|
|
|
|
|
|
|
|
|
|
def generate_sharding_size(dim_partition_dict, device_mesh):
    """
    Compute the product of the mesh dimension sizes used by a dim partition dict.

    Args:
        dim_partition_dict: maps a tensor dimension to the list of mesh dimensions it uses
        device_mesh: an object whose ``shape`` can be indexed by mesh dimension

    Returns:
        the product of ``device_mesh.shape[mesh_dim]`` over every mesh dimension listed in
        ``dim_partition_dict``, 1 if there is none
    """
    total_sharding_size = 1
    for mesh_dim_list in dim_partition_dict.values():
        # start the product from 1, otherwise reduce raises a TypeError on an empty mesh dim list
        sharding_size = reduce(operator.mul, (device_mesh.shape[mesh_dim] for mesh_dim in mesh_dim_list), 1)
        total_sharding_size *= sharding_size

    return total_sharding_size
|