mirror of https://github.com/hpcaitech/ColossalAI
365 lines
14 KiB
Python
365 lines
14 KiB
Python
![]() |
from cmath import log
|
||
|
from distutils.log import Log
|
||
|
import operator
|
||
|
import torch
|
||
|
from functools import reduce
|
||
|
from ..sharding_strategy import ShardingStrategy_V2, TrainCycleItem, MemoryCost
|
||
|
from colossalai.tensor.shape_consistency import CollectiveCommPattern
|
||
|
from .strategy_generator import StrategyGenerator_V2
|
||
|
from typing import List
|
||
|
|
||
|
|
||
|
class DotProductStrategyGenerator(StrategyGenerator_V2):
|
||
|
"""TODO: to be implemented"""
|
||
|
pass
|
||
|
|
||
|
|
||
|
class MatVecStrategyGenerator(StrategyGenerator_V2):
|
||
|
"""TODO: to be implemented"""
|
||
|
pass
|
||
|
|
||
|
|
||
|
class LinearProjectionStrategyGenerator(StrategyGenerator_V2):
|
||
|
|
||
|
def update_compute_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:
|
||
|
# C = AB
|
||
|
# C: [M, N], A: [M, P], B: [P, N]
|
||
|
# fwd cost = MNP (only count mul)
|
||
|
# bwd: 2 x fwd_cost
|
||
|
sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()
|
||
|
sharded_other_shape = strategy.sharding_specs[self.op_data['other']].get_sharded_shape_per_device()
|
||
|
dim_m_val = reduce(operator.mul, sharded_input_shape[:-1])
|
||
|
dim_n_val = sharded_other_shape[-1]
|
||
|
dim_p_val = sharded_other_shape[0]
|
||
|
|
||
|
fwd_compute_cost = dim_m_val * dim_n_val * dim_p_val
|
||
|
bwd_compute_cost = fwd_compute_cost * 2
|
||
|
compute_cost = TrainCycleItem(fwd=bwd_compute_cost,
|
||
|
bwd=bwd_compute_cost,
|
||
|
total=fwd_compute_cost + bwd_compute_cost)
|
||
|
strategy.compute_cost = compute_cost
|
||
|
|
||
|
def update_memory_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:
|
||
|
input_size = self._compute_size_in_bytes(strategy, "input")
|
||
|
other_size = self._compute_size_in_bytes(strategy, "input")
|
||
|
|
||
|
if "bias" in self.op_data:
|
||
|
bias_size = self._compute_size_in_bytes(strategy, "bias")
|
||
|
else:
|
||
|
bias_size = 0
|
||
|
output_size = self._compute_size_in_bytes(strategy, "output")
|
||
|
|
||
|
fwd_mem_cost = MemoryCost(activation=output_size, parameter=other_size + bias_size)
|
||
|
bwd_mem_cost = MemoryCost(activation=input_size + other_size + bias_size, parameter=other_size)
|
||
|
total_mem_cost = MemoryCost(activation=input_size + 2 * output_size + bias_size,
|
||
|
parameter=other_size + bias_size)
|
||
|
memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
|
||
|
strategy.memory_cost = memory_cost
|
||
|
|
||
|
def generate(self) -> List[ShardingStrategy_V2]:
|
||
|
strategies = []
|
||
|
|
||
|
# SS = SR x RS
|
||
|
strategies.append(self.split_lhs_space_rhs_space(0, 1))
|
||
|
strategies.append(self.split_lhs_space_rhs_space(1, 0))
|
||
|
|
||
|
# SR = SS x SR
|
||
|
strategies.append(self.split_lhs_space_both_contract(0, 1))
|
||
|
strategies.append(self.split_lhs_space_both_contract(1, 0))
|
||
|
|
||
|
# RS = RS x SS
|
||
|
strategies.append(self.split_rhs_space_both_contract(0, 1))
|
||
|
strategies.append(self.split_rhs_space_both_contract(1, 0))
|
||
|
|
||
|
# RR= RS x SR
|
||
|
strategies.append(self.recompute_split_both_contract(0))
|
||
|
strategies.append(self.recompute_split_both_contract(1))
|
||
|
|
||
|
# RS = RR x RS
|
||
|
strategies.append(self.split_rhs_space_only(0))
|
||
|
strategies.append(self.split_rhs_space_only(1))
|
||
|
|
||
|
# S01R = S01R x RR
|
||
|
strategies.append(self.split_lhs_1st_dim_1d(0, 1))
|
||
|
|
||
|
# RR = RS01 x S01R
|
||
|
strategies.append(self.split_lhs_2nd_dim_1d(0, 1))
|
||
|
|
||
|
# RS01 = RR x RS01
|
||
|
strategies.append(self.split_rhs_2nd_dim_1d(0, 1))
|
||
|
|
||
|
# update mete info on cost
|
||
|
for strategy in strategies:
|
||
|
self.update_communication_cost(strategy)
|
||
|
self.update_compute_cost(strategy)
|
||
|
self.update_memory_cost(strategy)
|
||
|
|
||
|
return strategies
|
||
|
|
||
|
def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
|
||
|
# handle case SS = SR x RS
|
||
|
name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
0: [mesh_dim_0]
|
||
|
},
|
||
|
"other": {
|
||
|
self.dim_q: [mesh_dim_1]
|
||
|
},
|
||
|
"bias": {
|
||
|
-1: [mesh_dim_1]
|
||
|
},
|
||
|
"output": {
|
||
|
0: [mesh_dim_0],
|
||
|
-1: [mesh_dim_1]
|
||
|
},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# set communication action
|
||
|
input_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping["input"],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=mesh_dim_1)
|
||
|
other_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec_mapping["output"],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=mesh_dim_0)
|
||
|
|
||
|
communication_action_mapping = {"input": input_comm_spec, "other": other_comm_spec}
|
||
|
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
|
||
|
# handle the case SR = SS x SR
|
||
|
name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'
|
||
|
|
||
|
# get sharding spec mapping
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
0: [mesh_dim_0],
|
||
|
-1: [mesh_dim_1]
|
||
|
},
|
||
|
"other": {
|
||
|
self.dim_p: [mesh_dim_1]
|
||
|
},
|
||
|
"bias": {},
|
||
|
"output": {
|
||
|
0: [mesh_dim_0]
|
||
|
},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication action mapping
|
||
|
input_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping["input"],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=mesh_dim_0)
|
||
|
output_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping["output"],
|
||
|
communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
|
||
|
logical_process_axis=mesh_dim_1)
|
||
|
|
||
|
communication_action_mapping = {"input": input_comm_spec, 'output': output_comm_spec}
|
||
|
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
|
||
|
name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'
|
||
|
|
||
|
# get sharding specs
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
-1: [mesh_dim_0]
|
||
|
},
|
||
|
"other": {
|
||
|
self.dim_p: [mesh_dim_0],
|
||
|
self.dim_q: [mesh_dim_1]
|
||
|
},
|
||
|
"bias": {
|
||
|
-1: [mesh_dim_1]
|
||
|
},
|
||
|
"output": {
|
||
|
-1: [mesh_dim_1]
|
||
|
},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication actions
|
||
|
output_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['output'],
|
||
|
communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
|
||
|
logical_process_axis=mesh_dim_0)
|
||
|
input_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['input'],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=mesh_dim_1)
|
||
|
communication_action_mapping = {"output": output_comm_spec, "input": input_comm_spec}
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def recompute_split_both_contract(self, mesh_dim):
|
||
|
name = f'RR = RS{mesh_dim} x S{mesh_dim}R'
|
||
|
|
||
|
# get sharding spec
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
-1: [mesh_dim]
|
||
|
},
|
||
|
"other": {
|
||
|
self.dim_p: [mesh_dim]
|
||
|
},
|
||
|
"bias": {},
|
||
|
"output": {},
|
||
|
}
|
||
|
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication action
|
||
|
output_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['output'],
|
||
|
communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
|
||
|
logical_process_axis=mesh_dim)
|
||
|
communication_action_mapping = {'output': output_comm_spec}
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def split_rhs_space_only(self, mesh_dim):
|
||
|
name = f'RS{mesh_dim} = RR x RS{mesh_dim}'
|
||
|
|
||
|
# get sharding spec
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {},
|
||
|
"other": {
|
||
|
self.dim_q: [mesh_dim]
|
||
|
},
|
||
|
"bias": {
|
||
|
-1: [mesh_dim]
|
||
|
},
|
||
|
"output": {
|
||
|
-1: [mesh_dim]
|
||
|
},
|
||
|
}
|
||
|
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication actions
|
||
|
input_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['input'],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=mesh_dim)
|
||
|
communication_action_mapping = {'input': input_comm_spec}
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def split_lhs_1st_dim_1d(self, mesh_dim_0, mesh_dim_1):
|
||
|
name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'
|
||
|
# get sharding spec
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
0: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
"other": {},
|
||
|
"bias": {},
|
||
|
"output": {
|
||
|
0: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication action
|
||
|
other_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['other'],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=[mesh_dim_0, mesh_dim_1])
|
||
|
|
||
|
communcation_action_mapping = {"other": other_comm_spec}
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communcation_action_mapping)
|
||
|
|
||
|
def split_lhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
|
||
|
name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'
|
||
|
|
||
|
# get sharding spec
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {
|
||
|
-1: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
"other": {
|
||
|
self.dim_p: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
"bias": {},
|
||
|
"output": {},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication action
|
||
|
output_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['output'],
|
||
|
communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
|
||
|
logical_process_axis=[mesh_dim_0, mesh_dim_1])
|
||
|
communication_action_mapping = {'output': output_comm_spec}
|
||
|
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def split_rhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
|
||
|
name = f'RS{mesh_dim_0}{mesh_dim_1} = RR x RS{mesh_dim_0}{mesh_dim_1}'
|
||
|
|
||
|
# get sharding spec
|
||
|
dim_partition_dict_mapping = {
|
||
|
"input": {},
|
||
|
"other": {
|
||
|
self.dim_q: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
"bias": {
|
||
|
-1: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
"output": {
|
||
|
-1: [mesh_dim_0, mesh_dim_1]
|
||
|
},
|
||
|
}
|
||
|
sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
|
||
|
|
||
|
# get communication action
|
||
|
input_comm_spec = self.get_communication_spec(
|
||
|
sharding_spec=sharding_spec_mapping['input'],
|
||
|
communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
|
||
|
logical_process_axis=[mesh_dim_0, mesh_dim_1])
|
||
|
communication_action_mapping = {'input': input_comm_spec}
|
||
|
|
||
|
return self.get_sharding_strategy(name=name,
|
||
|
sharding_spec_mapping=sharding_spec_mapping,
|
||
|
communication_action_mapping=communication_action_mapping)
|
||
|
|
||
|
def validate(self) -> bool:
|
||
|
assert "input" in self.op_data
|
||
|
assert "other" in self.op_data
|
||
|
|
||
|
# make sure the other has 2 dim
|
||
|
input_data = self.op_data['input']
|
||
|
other_data = self.op_data['other']
|
||
|
assert input_data.data.dim() > 0 and other_data.data.dim() == 2
|
||
|
assert other_data.logical_shape[0] == input_data.logical_shape[-1]
|
||
|
|
||
|
# check if bias has the same a valid dim
|
||
|
has_bias = "bias" in self.op_data
|
||
|
|
||
|
if has_bias:
|
||
|
bias_data = self.op_data['bias']
|
||
|
assert bias_data.logical_shape[-1] == other_data.logical_shape[-1]
|
||
|
|
||
|
|
||
|
class BatchedMatMulStrategyGenerator(StrategyGenerator_V2):
|
||
|
"""TODO: to be implemented"""
|
||
|
pass
|