ColossalAI/colossalai/auto_parallel/solver/strategy/matmul_strategy_generator.py

from cmath import log
from distutils.log import Log
import operator
import torch
from functools import reduce
from ..sharding_strategy import ShardingStrategy_V2, TrainCycleItem, MemoryCost
from colossalai.tensor.shape_consistency import CollectiveCommPattern
from .strategy_generator import StrategyGenerator_V2
from typing import List


class DotProductStrategyGenerator(StrategyGenerator_V2):
    """TODO: to be implemented"""
    pass


class MatVecStrategyGenerator(StrategyGenerator_V2):
    """TODO: to be implemented"""
    pass


class LinearProjectionStrategyGenerator(StrategyGenerator_V2):

    def update_compute_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:
        # C = AB
        # C: [M, N], A: [M, P], B: [P, N]
        # fwd cost = MNP (only count mul)
        # bwd: 2 x fwd_cost
        sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()
        sharded_other_shape = strategy.sharding_specs[self.op_data['other']].get_sharded_shape_per_device()
        dim_m_val = reduce(operator.mul, sharded_input_shape[:-1])
        dim_n_val = sharded_other_shape[-1]
        dim_p_val = sharded_other_shape[0]

        fwd_compute_cost = dim_m_val * dim_n_val * dim_p_val
        bwd_compute_cost = fwd_compute_cost * 2
        compute_cost = TrainCycleItem(fwd=bwd_compute_cost,
                                      bwd=bwd_compute_cost,
                                      total=fwd_compute_cost + bwd_compute_cost)
        strategy.compute_cost = compute_cost

    def update_memory_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:
        input_size = self._compute_size_in_bytes(strategy, "input")
        other_size = self._compute_size_in_bytes(strategy, "input")

        if "bias" in self.op_data:
            bias_size = self._compute_size_in_bytes(strategy, "bias")
        else:
            bias_size = 0
        output_size = self._compute_size_in_bytes(strategy, "output")

        fwd_mem_cost = MemoryCost(activation=output_size, parameter=other_size + bias_size)
        bwd_mem_cost = MemoryCost(activation=input_size + other_size + bias_size, parameter=other_size)
        total_mem_cost = MemoryCost(activation=input_size + 2 * output_size + bias_size,
                                    parameter=other_size + bias_size)
        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
        strategy.memory_cost = memory_cost

    def generate(self) -> List[ShardingStrategy_V2]:
        strategies = []

        # SS = SR x RS
        strategies.append(self.split_lhs_space_rhs_space(0, 1))
        strategies.append(self.split_lhs_space_rhs_space(1, 0))

        # SR = SS x SR
        strategies.append(self.split_lhs_space_both_contract(0, 1))
        strategies.append(self.split_lhs_space_both_contract(1, 0))

        # RS = RS x SS
        strategies.append(self.split_rhs_space_both_contract(0, 1))
        strategies.append(self.split_rhs_space_both_contract(1, 0))

        # RR= RS x SR
        strategies.append(self.recompute_split_both_contract(0))
        strategies.append(self.recompute_split_both_contract(1))

        # RS = RR x RS
        strategies.append(self.split_rhs_space_only(0))
        strategies.append(self.split_rhs_space_only(1))

        # S01R = S01R x RR
        strategies.append(self.split_lhs_1st_dim_1d(0, 1))

        # RR = RS01 x S01R
        strategies.append(self.split_lhs_2nd_dim_1d(0, 1))

        # RS01 = RR x RS01
        strategies.append(self.split_rhs_2nd_dim_1d(0, 1))

        # update mete info on cost
        for strategy in strategies:
            self.update_communication_cost(strategy)
            self.update_compute_cost(strategy)
            self.update_memory_cost(strategy)

        return strategies

    def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
        # handle case SS = SR x RS
        name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'
        dim_partition_dict_mapping = {
            "input": {
                0: [mesh_dim_0]
            },
            "other": {
                self.dim_q: [mesh_dim_1]
            },
            "bias": {
                -1: [mesh_dim_1]
            },
            "output": {
                0: [mesh_dim_0],
                -1: [mesh_dim_1]
            },
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # set communication action
        input_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping["input"],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=mesh_dim_1)
        other_comm_spec = self.get_communication_spec(
            sharding_spec_mapping["output"],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=mesh_dim_0)

        communication_action_mapping = {"input": input_comm_spec, "other": other_comm_spec}

        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
        # handle the case SR = SS x SR
        name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'

        # get sharding spec mapping
        dim_partition_dict_mapping = {
            "input": {
                0: [mesh_dim_0],
                -1: [mesh_dim_1]
            },
            "other": {
                self.dim_p: [mesh_dim_1]
            },
            "bias": {},
            "output": {
                0: [mesh_dim_0]
            },
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication action mapping
        input_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping["input"],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=mesh_dim_0)
        output_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping["output"],
            communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
            logical_process_axis=mesh_dim_1)

        communication_action_mapping = {"input": input_comm_spec, 'output': output_comm_spec}

        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
        name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'

        # get sharding specs
        dim_partition_dict_mapping = {
            "input": {
                -1: [mesh_dim_0]
            },
            "other": {
                self.dim_p: [mesh_dim_0],
                self.dim_q: [mesh_dim_1]
            },
            "bias": {
                -1: [mesh_dim_1]
            },
            "output": {
                -1: [mesh_dim_1]
            },
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication actions
        output_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['output'],
            communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
            logical_process_axis=mesh_dim_0)
        input_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['input'],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=mesh_dim_1)
        communication_action_mapping = {"output": output_comm_spec, "input": input_comm_spec}
        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def recompute_split_both_contract(self, mesh_dim):
        name = f'RR = RS{mesh_dim} x S{mesh_dim}R'

        # get sharding spec
        dim_partition_dict_mapping = {
            "input": {
                -1: [mesh_dim]
            },
            "other": {
                self.dim_p: [mesh_dim]
            },
            "bias": {},
            "output": {},
        }

        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication action
        output_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['output'],
            communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
            logical_process_axis=mesh_dim)
        communication_action_mapping = {'output': output_comm_spec}
        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def split_rhs_space_only(self, mesh_dim):
        name = f'RS{mesh_dim} = RR x RS{mesh_dim}'

        # get sharding spec
        dim_partition_dict_mapping = {
            "input": {},
            "other": {
                self.dim_q: [mesh_dim]
            },
            "bias": {
                -1: [mesh_dim]
            },
            "output": {
                -1: [mesh_dim]
            },
        }

        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication actions
        input_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['input'],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=mesh_dim)
        communication_action_mapping = {'input': input_comm_spec}
        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def split_lhs_1st_dim_1d(self, mesh_dim_0, mesh_dim_1):
        name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'
        # get sharding spec
        dim_partition_dict_mapping = {
            "input": {
                0: [mesh_dim_0, mesh_dim_1]
            },
            "other": {},
            "bias": {},
            "output": {
                0: [mesh_dim_0, mesh_dim_1]
            },
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication action
        other_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['other'],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=[mesh_dim_0, mesh_dim_1])

        communcation_action_mapping = {"other": other_comm_spec}
        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communcation_action_mapping)

    def split_lhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
        name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'

        # get sharding spec
        dim_partition_dict_mapping = {
            "input": {
                -1: [mesh_dim_0, mesh_dim_1]
            },
            "other": {
                self.dim_p: [mesh_dim_0, mesh_dim_1]
            },
            "bias": {},
            "output": {},
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication action
        output_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['output'],
            communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,
            logical_process_axis=[mesh_dim_0, mesh_dim_1])
        communication_action_mapping = {'output': output_comm_spec}

        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def split_rhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):
        name = f'RS{mesh_dim_0}{mesh_dim_1} = RR x RS{mesh_dim_0}{mesh_dim_1}'

        # get sharding spec
        dim_partition_dict_mapping = {
            "input": {},
            "other": {
                self.dim_q: [mesh_dim_0, mesh_dim_1]
            },
            "bias": {
                -1: [mesh_dim_0, mesh_dim_1]
            },
            "output": {
                -1: [mesh_dim_0, mesh_dim_1]
            },
        }
        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)

        # get communication action
        input_comm_spec = self.get_communication_spec(
            sharding_spec=sharding_spec_mapping['input'],
            communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
            logical_process_axis=[mesh_dim_0, mesh_dim_1])
        communication_action_mapping = {'input': input_comm_spec}

        return self.get_sharding_strategy(name=name,
                                          sharding_spec_mapping=sharding_spec_mapping,
                                          communication_action_mapping=communication_action_mapping)

    def validate(self) -> bool:
        assert "input" in self.op_data
        assert "other" in self.op_data

        # make sure the other has 2 dim
        input_data = self.op_data['input']
        other_data = self.op_data['other']
        assert input_data.data.dim() > 0 and other_data.data.dim() == 2
        assert other_data.logical_shape[0] == input_data.logical_shape[-1]

        # check if bias has the same a valid dim
        has_bias = "bias" in self.op_data

        if has_bias:
            bias_data = self.op_data['bias']
            assert bias_data.logical_shape[-1] == other_data.logical_shape[-1]


class BatchedMatMulStrategyGenerator(StrategyGenerator_V2):
    """TODO: to be implemented"""
    pass
[autoparallel] implemented linear projection strategy generator (#1639) 2022-09-26 08:58:14 +00:00			`from cmath import log`
			`from distutils.log import Log`
			`import operator`
			`import torch`
			`from functools import reduce`
			`from ..sharding_strategy import ShardingStrategy_V2, TrainCycleItem, MemoryCost`
			`from colossalai.tensor.shape_consistency import CollectiveCommPattern`
			`from .strategy_generator import StrategyGenerator_V2`
			`from typing import List`


			`class DotProductStrategyGenerator(StrategyGenerator_V2):`
			`"""TODO: to be implemented"""`
			`pass`


			`class MatVecStrategyGenerator(StrategyGenerator_V2):`
			`"""TODO: to be implemented"""`
			`pass`


			`class LinearProjectionStrategyGenerator(StrategyGenerator_V2):`

			`def update_compute_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:`
			`# C = AB`
			`# C: [M, N], A: [M, P], B: [P, N]`
			`# fwd cost = MNP (only count mul)`
			`# bwd: 2 x fwd_cost`
			`sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()`
			`sharded_other_shape = strategy.sharding_specs[self.op_data['other']].get_sharded_shape_per_device()`
			`dim_m_val = reduce(operator.mul, sharded_input_shape[:-1])`
			`dim_n_val = sharded_other_shape[-1]`
			`dim_p_val = sharded_other_shape[0]`

			`fwd_compute_cost = dim_m_val * dim_n_val * dim_p_val`
			`bwd_compute_cost = fwd_compute_cost * 2`
			`compute_cost = TrainCycleItem(fwd=bwd_compute_cost,`
			`bwd=bwd_compute_cost,`
			`total=fwd_compute_cost + bwd_compute_cost)`
			`strategy.compute_cost = compute_cost`

			`def update_memory_cost(self, strategy: ShardingStrategy_V2) -> ShardingStrategy_V2:`
			`input_size = self._compute_size_in_bytes(strategy, "input")`
			`other_size = self._compute_size_in_bytes(strategy, "input")`

			`if "bias" in self.op_data:`
			`bias_size = self._compute_size_in_bytes(strategy, "bias")`
			`else:`
			`bias_size = 0`
			`output_size = self._compute_size_in_bytes(strategy, "output")`

			`fwd_mem_cost = MemoryCost(activation=output_size, parameter=other_size + bias_size)`
			`bwd_mem_cost = MemoryCost(activation=input_size + other_size + bias_size, parameter=other_size)`
			`total_mem_cost = MemoryCost(activation=input_size + 2 * output_size + bias_size,`
			`parameter=other_size + bias_size)`
			`memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)`
			`strategy.memory_cost = memory_cost`

			`def generate(self) -> List[ShardingStrategy_V2]:`
			`strategies = []`

			`# SS = SR x RS`
			`strategies.append(self.split_lhs_space_rhs_space(0, 1))`
			`strategies.append(self.split_lhs_space_rhs_space(1, 0))`

			`# SR = SS x SR`
			`strategies.append(self.split_lhs_space_both_contract(0, 1))`
			`strategies.append(self.split_lhs_space_both_contract(1, 0))`

			`# RS = RS x SS`
			`strategies.append(self.split_rhs_space_both_contract(0, 1))`
			`strategies.append(self.split_rhs_space_both_contract(1, 0))`

			`# RR= RS x SR`
			`strategies.append(self.recompute_split_both_contract(0))`
			`strategies.append(self.recompute_split_both_contract(1))`

			`# RS = RR x RS`
			`strategies.append(self.split_rhs_space_only(0))`
			`strategies.append(self.split_rhs_space_only(1))`

			`# S01R = S01R x RR`
			`strategies.append(self.split_lhs_1st_dim_1d(0, 1))`

			`# RR = RS01 x S01R`
			`strategies.append(self.split_lhs_2nd_dim_1d(0, 1))`

			`# RS01 = RR x RS01`
			`strategies.append(self.split_rhs_2nd_dim_1d(0, 1))`

			`# update mete info on cost`
			`for strategy in strategies:`
			`self.update_communication_cost(strategy)`
			`self.update_compute_cost(strategy)`
			`self.update_memory_cost(strategy)`

			`return strategies`

			`def split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):`
			`# handle case SS = SR x RS`
			`name = f'S{mesh_dim_0}S{mesh_dim_1} = S{mesh_dim_0}R x RS{mesh_dim_1}'`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`0: [mesh_dim_0]`
			`},`
			`"other": {`
			`self.dim_q: [mesh_dim_1]`
			`},`
			`"bias": {`
			`-1: [mesh_dim_1]`
			`},`
			`"output": {`
			`0: [mesh_dim_0],`
			`-1: [mesh_dim_1]`
			`},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# set communication action`
			`input_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping["input"],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=mesh_dim_1)`
			`other_comm_spec = self.get_communication_spec(`
			`sharding_spec_mapping["output"],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=mesh_dim_0)`

			`communication_action_mapping = {"input": input_comm_spec, "other": other_comm_spec}`

			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):`
			`# handle the case SR = SS x SR`
			`name = f'S{mesh_dim_0}R = S{mesh_dim_0}S{mesh_dim_1} x S{mesh_dim_1}R'`

			`# get sharding spec mapping`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`0: [mesh_dim_0],`
			`-1: [mesh_dim_1]`
			`},`
			`"other": {`
			`self.dim_p: [mesh_dim_1]`
			`},`
			`"bias": {},`
			`"output": {`
			`0: [mesh_dim_0]`
			`},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication action mapping`
			`input_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping["input"],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=mesh_dim_0)`
			`output_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping["output"],`
			`communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,`
			`logical_process_axis=mesh_dim_1)`

			`communication_action_mapping = {"input": input_comm_spec, 'output': output_comm_spec}`

			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):`
			`name = f'RS{mesh_dim_1} = RS{mesh_dim_0} x S{mesh_dim_0}S{mesh_dim_1}'`

			`# get sharding specs`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`-1: [mesh_dim_0]`
			`},`
			`"other": {`
			`self.dim_p: [mesh_dim_0],`
			`self.dim_q: [mesh_dim_1]`
			`},`
			`"bias": {`
			`-1: [mesh_dim_1]`
			`},`
			`"output": {`
			`-1: [mesh_dim_1]`
			`},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication actions`
			`output_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['output'],`
			`communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,`
			`logical_process_axis=mesh_dim_0)`
			`input_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['input'],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=mesh_dim_1)`
			`communication_action_mapping = {"output": output_comm_spec, "input": input_comm_spec}`
			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def recompute_split_both_contract(self, mesh_dim):`
			`name = f'RR = RS{mesh_dim} x S{mesh_dim}R'`

			`# get sharding spec`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`-1: [mesh_dim]`
			`},`
			`"other": {`
			`self.dim_p: [mesh_dim]`
			`},`
			`"bias": {},`
			`"output": {},`
			`}`

			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication action`
			`output_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['output'],`
			`communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,`
			`logical_process_axis=mesh_dim)`
			`communication_action_mapping = {'output': output_comm_spec}`
			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def split_rhs_space_only(self, mesh_dim):`
			`name = f'RS{mesh_dim} = RR x RS{mesh_dim}'`

			`# get sharding spec`
			`dim_partition_dict_mapping = {`
			`"input": {},`
			`"other": {`
			`self.dim_q: [mesh_dim]`
			`},`
			`"bias": {`
			`-1: [mesh_dim]`
			`},`
			`"output": {`
			`-1: [mesh_dim]`
			`},`
			`}`

			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication actions`
			`input_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['input'],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=mesh_dim)`
			`communication_action_mapping = {'input': input_comm_spec}`
			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def split_lhs_1st_dim_1d(self, mesh_dim_0, mesh_dim_1):`
			`name = f'S{mesh_dim_0}{mesh_dim_1}R = S{mesh_dim_0}{mesh_dim_1}R x RR'`
			`# get sharding spec`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`0: [mesh_dim_0, mesh_dim_1]`
			`},`
			`"other": {},`
			`"bias": {},`
			`"output": {`
			`0: [mesh_dim_0, mesh_dim_1]`
			`},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication action`
			`other_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['other'],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=[mesh_dim_0, mesh_dim_1])`

			`communcation_action_mapping = {"other": other_comm_spec}`
			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communcation_action_mapping)`

			`def split_lhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):`
			`name = f'RR = RS{mesh_dim_0}{mesh_dim_1} x S{mesh_dim_0}{mesh_dim_1}R'`

			`# get sharding spec`
			`dim_partition_dict_mapping = {`
			`"input": {`
			`-1: [mesh_dim_0, mesh_dim_1]`
			`},`
			`"other": {`
			`self.dim_p: [mesh_dim_0, mesh_dim_1]`
			`},`
			`"bias": {},`
			`"output": {},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication action`
			`output_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['output'],`
			`communication_pattern=CollectiveCommPattern.REDUCE_FWD_IDENTITY_BWD,`
			`logical_process_axis=[mesh_dim_0, mesh_dim_1])`
			`communication_action_mapping = {'output': output_comm_spec}`

			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def split_rhs_2nd_dim_1d(self, mesh_dim_0, mesh_dim_1):`
			`name = f'RS{mesh_dim_0}{mesh_dim_1} = RR x RS{mesh_dim_0}{mesh_dim_1}'`

			`# get sharding spec`
			`dim_partition_dict_mapping = {`
			`"input": {},`
			`"other": {`
			`self.dim_q: [mesh_dim_0, mesh_dim_1]`
			`},`
			`"bias": {`
			`-1: [mesh_dim_0, mesh_dim_1]`
			`},`
			`"output": {`
			`-1: [mesh_dim_0, mesh_dim_1]`
			`},`
			`}`
			`sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)`

			`# get communication action`
			`input_comm_spec = self.get_communication_spec(`
			`sharding_spec=sharding_spec_mapping['input'],`
			`communication_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,`
			`logical_process_axis=[mesh_dim_0, mesh_dim_1])`
			`communication_action_mapping = {'input': input_comm_spec}`

			`return self.get_sharding_strategy(name=name,`
			`sharding_spec_mapping=sharding_spec_mapping,`
			`communication_action_mapping=communication_action_mapping)`

			`def validate(self) -> bool:`
			`assert "input" in self.op_data`
			`assert "other" in self.op_data`

			`# make sure the other has 2 dim`
			`input_data = self.op_data['input']`
			`other_data = self.op_data['other']`
			`assert input_data.data.dim() > 0 and other_data.data.dim() == 2`
			`assert other_data.logical_shape[0] == input_data.logical_shape[-1]`

			`# check if bias has the same a valid dim`
			`has_bias = "bias" in self.op_data`

			`if has_bias:`
			`bias_data = self.op_data['bias']`
			`assert bias_data.logical_shape[-1] == other_data.logical_shape[-1]`


			`class BatchedMatMulStrategyGenerator(StrategyGenerator_V2):`
			`"""TODO: to be implemented"""`
			`pass`