mirror of https://github.com/hpcaitech/ColossalAI
[autoparallel] refactor handlers which reshape input tensors (#2615)
* [autoparallel] refactor handlers which reshape input tensors
* polish

parent 28398f1c70
commit 37df666f38
@@ -3,8 +3,8 @@ from .batch_norm_handler import BatchNormModuleHandler
 from .binary_elementwise_handler import BinaryElementwiseHandler
 from .bmm_handler import AddBMMFunctionHandler, BMMFunctionHandler
 from .conv_handler import ConvFunctionHandler, ConvModuleHandler
+from .default_reshape_handler import DefaultReshapeHandler
 from .embedding_handler import EmbeddingFunctionHandler, EmbeddingModuleHandler
-from .experimental import PermuteHandler, ViewHandler
 from .getattr_handler import GetattrHandler
 from .getitem_handler import GetItemHandler
 from .layer_norm_handler import LayerNormModuleHandler
@@ -13,20 +13,24 @@ from .matmul_handler import MatMulHandler
 from .normal_pooling_handler import NormPoolingHandler
 from .option import ShardOption
 from .output_handler import OutputHandler
+from .permute_handler import PermuteHandler
 from .placeholder_handler import PlaceholderHandler
 from .registry import operator_registry
-from .reshape_handler import ReshapeHandler
 from .softmax_handler import SoftmaxHandler
+from .split_handler import SplitHandler
 from .sum_handler import SumHandler
 from .tensor_constructor_handler import TensorConstructorHandler
+from .transpose_handler import TransposeHandler
 from .unary_elementwise_handler import UnaryElementwiseHandler
+from .view_handler import ViewHandler
 from .where_handler import WhereHandler

 __all__ = [
     'LinearFunctionHandler', 'LinearModuleHandler', 'BMMFunctionHandler', 'AddBMMFunctionHandler',
     'LayerNormModuleHandler', 'BatchNormModuleHandler', 'ConvModuleHandler', 'ConvFunctionHandler',
-    'UnaryElementwiseHandler', 'ReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
+    'UnaryElementwiseHandler', 'DefaultReshapeHandler', 'PlaceholderHandler', 'OutputHandler', 'WhereHandler',
     'NormPoolingHandler', 'BinaryElementwiseHandler', 'MatMulHandler', 'operator_registry', 'ADDMMFunctionHandler',
     'GetItemHandler', 'GetattrHandler', 'ViewHandler', 'PermuteHandler', 'TensorConstructorHandler',
-    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption'
+    'EmbeddingModuleHandler', 'EmbeddingFunctionHandler', 'SumHandler', 'SoftmaxHandler', 'ShardOption',
+    'TransposeHandler', 'SplitHandler'
 ]
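Net effect of the two hunks above: the specialized reshape-family handlers are now exported from the `node_handler` package root, and the old `ReshapeHandler` export is replaced by `DefaultReshapeHandler`. A hypothetical downstream import, before and after this commit (names taken from the `__all__` lists above):

```python
# Before this commit:
# from colossalai.auto_parallel.tensor_shard.node_handler import ReshapeHandler
# from colossalai.auto_parallel.tensor_shard.node_handler.experimental import PermuteHandler, ViewHandler

# After this commit, everything comes from the package root:
from colossalai.auto_parallel.tensor_shard.node_handler import (
    DefaultReshapeHandler,
    PermuteHandler,
    SplitHandler,
    TransposeHandler,
    ViewHandler,
)
```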
@@ -5,23 +5,23 @@ import torch
 from ..sharding_strategy import OperationData, OperationDataType
 from .node_handler import MetaInfoNodeHandler, NodeHandler
 from .registry import operator_registry
-from .strategy import ReshapeGenerator, StrategyGenerator
+from .strategy import DefaultReshapeGenerator, StrategyGenerator

-__all__ = ['ReshapeHandler']
+__all__ = ['DefaultReshapeHandler']


 @operator_registry.register(torch.flatten)
 @operator_registry.register(torch.Tensor.unsqueeze)
 @operator_registry.register(torch.nn.AdaptiveAvgPool2d)
-class ReshapeHandler(MetaInfoNodeHandler):
+class DefaultReshapeHandler(MetaInfoNodeHandler):
     """
-    A ReshapeHandler which deals with the sharding strategies for Reshape Op, such as torch.reshape.
+    A DefaultReshapeHandler which deals with the sharding strategies for Reshape Op, such as torch.reshape.
     """

     def get_strategy_generator(self) -> List[StrategyGenerator]:
         op_data_mapping = self.get_operation_data_mapping()
         generators = []
-        generators.append(ReshapeGenerator(op_data_mapping, self.device_mesh, self.node.args[0]))
+        generators.append(DefaultReshapeGenerator(op_data_mapping, self.device_mesh, self.node.args[0]))
         return generators

     def infer_logical_shape(self, data):
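The stacked `@operator_registry.register(...)` decorators above bind one handler class to several torch ops. A minimal sketch of that decorator pattern, assuming nothing about ColossalAI's actual `Registry` internals:

```python
# Minimal sketch of a decorator-based operator registry; illustrative only,
# not ColossalAI's real implementation.
class OperatorRegistry:

    def __init__(self):
        self.store = {}

    def register(self, source):
        # Returning the class unchanged lets decorators stack, so one handler
        # can serve torch.flatten, torch.Tensor.unsqueeze and AdaptiveAvgPool2d.
        def wrapper(handler_cls):
            self.store[source] = handler_cls
            return handler_cls

        return wrapper

    def get(self, source):
        return self.store[source]


operator_registry = OperatorRegistry()


@operator_registry.register('flatten')    # stand-in keys; the real registry
@operator_registry.register('unsqueeze')  # keys on the torch callables themselves
class DemoHandler:
    pass


assert operator_registry.get('flatten') is DemoHandler
```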
@@ -1,10 +0,0 @@
-from .permute_handler import PermuteHandler
-from .reshape_generator import PermuteGenerator, SplitGenerator, TransposeGenerator, ViewGenerator
-from .split_handler import SplitHandler
-from .transpose_handler import TransposeHandler
-from .view_handler import ViewHandler
-
-__all__ = [
-    'ViewGenerator', 'ViewHandler', 'PermuteGenerator', 'PermuteHandler', 'TransposeGenerator', 'TransposeGenerator',
-    'SplitHandler', 'SplitGenerator'
-]
@@ -1,299 +0,0 @@
-import copy
-from typing import List
-
-from colossalai.auto_parallel.tensor_shard.node_handler.strategy.strategy_generator import FollowingStrategyGenerator
-from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
-    CommAction,
-    CommType,
-    MemoryCost,
-    ShardingStrategy,
-    TrainCycleItem,
-)
-from colossalai.auto_parallel.tensor_shard.utils import (
-    check_keep_sharding_status,
-    detect_reshape_mapping,
-    infer_output_dim_partition_dict,
-)
-from colossalai.tensor.shape_consistency import CollectiveCommPattern
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-__all__ = ['ReshapeGenerator', 'ViewGenerator', 'PermuteGenerator', 'TransposeGenerator', 'SplitGenerator']
-
-
-class ReshapeGenerator(FollowingStrategyGenerator):
-    """
-    ReshapeGenerator is the base class for all the reshape operation.
-    """
-
-    def validate(self) -> bool:
-        return super().validate()
-
-    def update_compute_cost(self, strategy: ShardingStrategy):
-        compute_cost = TrainCycleItem(fwd=10, bwd=10, total=20)
-        strategy.compute_cost = compute_cost
-
-    def update_memory_cost(self, strategy: ShardingStrategy):
-        '''
-        Compute the memory cost per device with this specific strategy.
-        '''
-        forward_size_mapping = {
-            'input': self._compute_size_in_bytes(strategy, "input"),
-            'output': self._compute_size_in_bytes(strategy, "output")
-        }
-
-        backward_size_mapping = copy.deepcopy(forward_size_mapping)
-        backward_size_mapping.pop("output")
-        # compute fwd cost incurred
-        # fwd_cost = input + output
-        fwd_activation_cost = sum([v for k, v in forward_size_mapping.items() if not self.is_param(k)])
-        fwd_parameter_cost = sum([v for k, v in forward_size_mapping.items() if self.is_param(k)])
-        fwd_mem_cost = MemoryCost(activation=fwd_activation_cost, parameter=fwd_parameter_cost)
-
-        # compute bwd cost incurred
-        # bwd_cost = input_grad
-        bwd_activation_cost = sum([v for k, v in backward_size_mapping.items() if not self.is_param(k)])
-        bwd_parameter_cost = sum([v for k, v in backward_size_mapping.items() if self.is_param(k)])
-        bwd_mem_cost = MemoryCost(activation=bwd_activation_cost, parameter=bwd_parameter_cost)
-
-        # compute total cost
-        total_mem_cost = MemoryCost(activation=fwd_activation_cost + bwd_activation_cost,
-                                    parameter=fwd_parameter_cost + bwd_parameter_cost)
-        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
-        strategy.memory_cost = memory_cost
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        return super().collate_strategies()
-
-
-class ViewGenerator(ReshapeGenerator):
-    """
-    ViewGenerator deals with the sharding strategies of view op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-
-            origin_shape = self.op_data['input'].data.shape
-            tgt_shape = self.op_data['tgt_shape'].data
-
-            reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
-
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            keep_sharding_status = check_keep_sharding_status(dim_partition_dict_for_input, reshape_mapping_dict)
-
-            if keep_sharding_status:
-                dim_partition_dict_for_output = infer_output_dim_partition_dict(dim_partition_dict_for_input,
-                                                                                reshape_mapping_dict)
-            else:
-                dim_partition_dict_for_output = {}
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            if keep_sharding_status:
-                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-            else:
-                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> FULLY REPLICATED_{index}'
-
-                # add comm action for converting input to fully replicated
-                total_mesh_dim_list = []
-                for mesh_dim_list in dim_partition_dict_for_input.values():
-                    total_mesh_dim_list.extend(mesh_dim_list)
-                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
-                if len(total_mesh_dim_list) == 1:
-                    total_mesh_dim_list = total_mesh_dim_list[0]
-                    # the total mesh dim list only has one element, so the shard dim has only one element as well.
-                    shard_dim = list(dim_partition_dict_for_input.keys())[0]
-                    input_comm_action = self.get_communication_action(
-                        sharding_spec=sharding_spec_mapping["input"],
-                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
-                        logical_process_axis=total_mesh_dim_list,
-                        comm_type=CommType.BEFORE,
-                        arg_index=0)
-                    # it will gather the input through gather_dim during forward phase.
-                    input_comm_action.comm_spec.gather_dim = shard_dim
-                    # it will split the input activation grad through shard_dim during backward phase.
-                    input_comm_action.comm_spec.shard_dim = shard_dim
-
-                elif len(total_mesh_dim_list) >= 2:
-                    source_spec = sharding_spec_mapping["input"]
-                    target_spec = ShardingSpec(device_mesh=self.device_mesh,
-                                               entire_shape=source_spec.entire_shape,
-                                               dim_partition_dict={})
-                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
-                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
-
-                else:
-                    input_comm_action = None
-
-                if input_comm_action is not None:
-                    communication_action_mapping["input"] = input_comm_action
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class PermuteGenerator(ReshapeGenerator):
-    """
-    PermuteGenerator deals with the sharding strategies of permute op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-
-            permute_dims = self.op_data['permute_dims'].data
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            dim_partition_dict_for_output = {}
-            for dim_index, permute_dim in enumerate(permute_dims):
-                if permute_dim in dim_partition_dict_for_input:
-                    dim_partition_dict_for_output[dim_index] = dim_partition_dict_for_input[permute_dim]
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class TransposeGenerator(ReshapeGenerator):
-    """
-    TransposeGenerator deals with the sharding strategies of permute op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
-            dim_partition_dict_for_output = {}
-
-            transpose_dims = self.op_data['transpose_dims'].data
-            dim_0 = transpose_dims[0]
-            dim_1 = transpose_dims[1]
-            for dim, sharded_dims in dim_partition_dict_for_input.items():
-                if dim == dim_0:
-                    dim_partition_dict_for_output[dim_1] = dim_partition_dict_for_input[dim_0]
-                elif dim == dim_1:
-                    dim_partition_dict_for_output[dim_0] = dim_partition_dict_for_input[dim_1]
-                else:
-                    dim_partition_dict_for_output[dim] = sharded_dims
-
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
-
-
-class SplitGenerator(ReshapeGenerator):
-    """
-    SplitGenerator deals with the sharding strategies of split op.
-    """
-
-    def collate_strategies(self) -> List[ShardingStrategy]:
-        strategy_list = []
-        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
-            recover_dims = None
-            dim_partition_dict_mapping = {}
-            communication_action_mapping = {}
-            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
-            dim_partition_dict_for_input = copy.deepcopy(input_sharding_spec.dim_partition_dict)
-            split_size, split_dim = self.op_data['split_info'].data
-
-            if split_dim in dim_partition_dict_for_input:
-                recover_dims = dim_partition_dict_for_input.pop(split_dim)
-
-            dim_partition_dict_for_output = [
-                copy.deepcopy(dim_partition_dict_for_input) for _ in range(len(self.op_data["output"].data))
-            ]
-            assert len(dim_partition_dict_for_output) >= 2
-            dim_partition_dict_mapping = {
-                "input": dim_partition_dict_for_input,
-                "output": dim_partition_dict_for_output,
-            }
-            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{sharding_spec_mapping["input"].sharding_sequence}_{index}'
-
-            # add comm action if the input need to be recovered to replica in the split dimension.
-            if recover_dims:
-                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
-                if len(recover_dims) == 1:
-                    recover_dims = recover_dims[0]
-                    input_comm_action = self.get_communication_action(
-                        sharding_spec=sharding_spec_mapping["input"],
-                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
-                        logical_process_axis=recover_dims,
-                        comm_type=CommType.BEFORE,
-                        arg_index=0)
-                    # it will gather the input through gather_dim during forward phase.
-                    input_comm_action.comm_spec.gather_dim = split_dim
-                    # it will split the input activation grad through split_dim during backward phase.
-                    input_comm_action.comm_spec.shard_dim = split_dim
-
-                elif len(recover_dims) >= 2:
-                    # original sharding spec
-                    source_spec = input_sharding_spec
-                    # target sharding spec
-                    target_spec = sharding_spec_mapping["input"]
-                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
-                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
-
-                else:
-                    input_comm_action = None
-
-                if input_comm_action is not None:
-                    communication_action_mapping["input"] = input_comm_action
-
-            strategy = self.get_sharding_strategy(name=name,
-                                                  sharding_spec_mapping=sharding_spec_mapping,
-                                                  communication_action_mapping=communication_action_mapping)
-            strategy_list.append(strategy)
-
-        return strategy_list
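The file removed above reappears, extended, under the `strategy` package in a later hunk. Its per-op bookkeeping is plain dict manipulation and can be exercised standalone; a sketch of PermuteGenerator's remapping with hypothetical values, where dicts map tensor dims to device-mesh axes:

```python
# Standalone sketch of the dim-partition remapping in PermuteGenerator above.
def permute_partition(dim_partition_dict_for_input, permute_dims):
    dim_partition_dict_for_output = {}
    for dim_index, permute_dim in enumerate(permute_dims):
        if permute_dim in dim_partition_dict_for_input:
            dim_partition_dict_for_output[dim_index] = dim_partition_dict_for_input[permute_dim]
    return dim_partition_dict_for_output


# A tensor sharded on dim 0 (mesh axis 0) and dim 2 (mesh axis 1), permuted
# with dims (2, 0, 1): each shard follows its dim to the new position.
assert permute_partition({0: [0], 2: [1]}, (2, 0, 1)) == {0: [1], 1: [0]}
```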
@@ -2,11 +2,10 @@ from typing import Dict, List

 import torch

-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import PermuteGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import PermuteGenerator, StrategyGenerator

 __all__ = ['PermuteHandler']

@@ -2,11 +2,10 @@ from typing import Dict, List

 import torch

-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import SplitGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import SplitGenerator, StrategyGenerator

 __all__ = ['SplitHandler']

@@ -14,7 +14,13 @@ from .matmul_strategy_generator import (
 from .normal_pooling_generator import NormalPoolStrategyGenerator
 from .output_generator import OutputGenerator
 from .placeholder_generator import PlaceholderGenerator
-from .reshape_generator import ReshapeGenerator
+from .reshape_generator import (
+    DefaultReshapeGenerator,
+    PermuteGenerator,
+    SplitGenerator,
+    TransposeGenerator,
+    ViewGenerator,
+)
 from .softmax_generator import SoftmaxGenerator
 from .strategy_generator import StrategyGenerator
 from .sum_generator import SumGenerator
@@ -26,7 +32,8 @@ __all__ = [
     'StrategyGenerator', 'DotProductStrategyGenerator', 'MatVecStrategyGenerator', 'LinearProjectionStrategyGenerator',
     'BatchedMatMulStrategyGenerator', 'ConvStrategyGenerator', 'UnaryElementwiseGenerator',
     'BatchNormStrategyGenerator', 'GetItemStrategyGenerator', 'TensorStrategyGenerator', 'TensorTupleStrategyGenerator',
-    'LayerNormGenerator', 'ReshapeGenerator', 'PlaceholderGenerator', 'OutputGenerator', 'WhereGenerator',
-    'ReshapeGenerator', 'NormalPoolStrategyGenerator', 'BinaryElementwiseStrategyGenerator', 'GetattrGenerator',
-    'TensorConstructorGenerator', 'EmbeddingStrategyGenerator', 'SumGenerator', 'SoftmaxGenerator'
+    'LayerNormGenerator', 'PlaceholderGenerator', 'OutputGenerator', 'WhereGenerator', 'NormalPoolStrategyGenerator',
+    'BinaryElementwiseStrategyGenerator', 'GetattrGenerator', 'TensorConstructorGenerator',
+    'EmbeddingStrategyGenerator', 'SumGenerator', 'SoftmaxGenerator', 'ViewGenerator', 'PermuteGenerator',
+    'TransposeGenerator', 'SplitGenerator', 'DefaultReshapeGenerator'
 ]
@@ -1,6 +1,7 @@
 import copy
 from typing import List

+from colossalai.auto_parallel.tensor_shard.node_handler.strategy.strategy_generator import FollowingStrategyGenerator
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     CommAction,
     CommType,
@@ -8,17 +9,20 @@ from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
     ShardingStrategy,
     TrainCycleItem,
 )
+from colossalai.auto_parallel.tensor_shard.utils import (
+    check_keep_sharding_status,
+    detect_reshape_mapping,
+    infer_output_dim_partition_dict,
+)
 from colossalai.tensor.shape_consistency import CollectiveCommPattern
 from colossalai.tensor.sharding_spec import ShardingSpec

-from .strategy_generator import FollowingStrategyGenerator
-
-__all__ = ['ReshapeGenerator']
+__all__ = ['ReshapeGenerator', 'ViewGenerator', 'PermuteGenerator', 'TransposeGenerator', 'SplitGenerator']


 class ReshapeGenerator(FollowingStrategyGenerator):
     """
-    ReshapeGenerator which deals with the sharding strategies of Reshape Op, such as torch.Tensor.permute.
+    ReshapeGenerator is the base class for all the reshape operation.
     """

     def validate(self) -> bool:
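With the `__all__` and docstring changes above, `ReshapeGenerator` stops being the one concrete reshape generator and becomes the shared base. A sketch of the hierarchy the next hunk fills in, with stub bodies (the one-line summaries paraphrase the diff; `FollowingStrategyGenerator` here is a stand-in for the imported base):

```python
class FollowingStrategyGenerator:
    """Stand-in base: generators that derive strategies from the predecessor node."""


class ReshapeGenerator(FollowingStrategyGenerator):
    """Shared compute/memory cost accounting for reshape-like ops."""


class ViewGenerator(ReshapeGenerator):
    """Keeps the input sharding across the shape change when it is legal."""


class PermuteGenerator(ReshapeGenerator):
    """Moves each dim's sharding to that dim's new position."""


class TransposeGenerator(ReshapeGenerator):
    """Swaps the sharding of the two transposed dims."""


class SplitGenerator(ReshapeGenerator):
    """Drops sharding on the split dim, recovering it via a comm action."""


class DefaultReshapeGenerator(ReshapeGenerator):
    """Fallback: gathers the input to fully replicated before the op."""
```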
@@ -57,11 +61,255 @@ class ReshapeGenerator(FollowingStrategyGenerator):
         memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
         strategy.memory_cost = memory_cost

+    def collate_strategies(self) -> List[ShardingStrategy]:
+        return super().collate_strategies()
+
+
+class ViewGenerator(ReshapeGenerator):
+    """
+    ViewGenerator deals with the sharding strategies of view op.
+    """
+
     def collate_strategies(self) -> List[ShardingStrategy]:
         strategy_list = []
-        # For reshape function, to keep the computing correctness we keep the sharding
-        # spec of input is fully replicated. In addition, we will keep the output in
-        # replica status and let the successor node choose the way to resharding the
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+
+            origin_shape = self.op_data['input'].data.shape
+            tgt_shape = self.op_data['tgt_shape'].data
+
+            reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
+
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            keep_sharding_status = check_keep_sharding_status(dim_partition_dict_for_input, reshape_mapping_dict)
+
+            if keep_sharding_status:
+                dim_partition_dict_for_output = infer_output_dim_partition_dict(dim_partition_dict_for_input,
+                                                                                reshape_mapping_dict)
+            else:
+                dim_partition_dict_for_output = {}
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            if keep_sharding_status:
+                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+            else:
+                name = f'{sharding_spec_mapping["input"].sharding_sequence} -> FULLY REPLICATED_{index}'
+
+                # add comm action for converting input to fully replicated
+                total_mesh_dim_list = []
+                for mesh_dim_list in dim_partition_dict_for_input.values():
+                    total_mesh_dim_list.extend(mesh_dim_list)
+                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
+                if len(total_mesh_dim_list) == 1:
+                    total_mesh_dim_list = total_mesh_dim_list[0]
+                    # the total mesh dim list only has one element, so the shard dim has only one element as well.
+                    shard_dim = list(dim_partition_dict_for_input.keys())[0]
+                    input_comm_action = self.get_communication_action(
+                        sharding_spec=sharding_spec_mapping["input"],
+                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
+                        logical_process_axis=total_mesh_dim_list,
+                        comm_type=CommType.BEFORE,
+                        arg_index=0)
+                    # it will gather the input through gather_dim during forward phase.
+                    input_comm_action.comm_spec.gather_dim = shard_dim
+                    # it will split the input activation grad through shard_dim during backward phase.
+                    input_comm_action.comm_spec.shard_dim = shard_dim
+
+                elif len(total_mesh_dim_list) >= 2:
+                    source_spec = sharding_spec_mapping["input"]
+                    target_spec = ShardingSpec(device_mesh=self.device_mesh,
+                                               entire_shape=source_spec.entire_shape,
+                                               dim_partition_dict={})
+                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
+                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
+
+                else:
+                    input_comm_action = None
+
+                if input_comm_action is not None:
+                    communication_action_mapping["input"] = input_comm_action
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class PermuteGenerator(ReshapeGenerator):
+    """
+    PermuteGenerator deals with the sharding strategies of permute op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+
+            permute_dims = self.op_data['permute_dims'].data
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            dim_partition_dict_for_output = {}
+            for dim_index, permute_dim in enumerate(permute_dims):
+                if permute_dim in dim_partition_dict_for_input:
+                    dim_partition_dict_for_output[dim_index] = dim_partition_dict_for_input[permute_dim]
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class TransposeGenerator(ReshapeGenerator):
+    """
+    TransposeGenerator deals with the sharding strategies of transpose op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+            dim_partition_dict_for_input = input_sharding_spec.dim_partition_dict
+            dim_partition_dict_for_output = {}
+
+            transpose_dims = self.op_data['transpose_dims'].data
+            dim_0 = transpose_dims[0]
+            dim_1 = transpose_dims[1]
+            for dim, sharded_dims in dim_partition_dict_for_input.items():
+                if dim == dim_0:
+                    dim_partition_dict_for_output[dim_1] = dim_partition_dict_for_input[dim_0]
+                elif dim == dim_1:
+                    dim_partition_dict_for_output[dim_0] = dim_partition_dict_for_input[dim_1]
+                else:
+                    dim_partition_dict_for_output[dim] = sharded_dims
+
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence} -> {sharding_spec_mapping["output"].sharding_sequence}_{index}'
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class SplitGenerator(ReshapeGenerator):
+    """
+    SplitGenerator deals with the sharding strategies of split op.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        for index, strategy in enumerate(self.predecessor_node.strategies_vector):
+            recover_dims = None
+            dim_partition_dict_mapping = {}
+            communication_action_mapping = {}
+            input_sharding_spec = strategy.output_sharding_specs[self.op_data["input"]]
+            dim_partition_dict_for_input = copy.deepcopy(input_sharding_spec.dim_partition_dict)
+            split_size, split_dim = self.op_data['split_info'].data
+
+            if split_dim in dim_partition_dict_for_input:
+                recover_dims = dim_partition_dict_for_input.pop(split_dim)
+
+            dim_partition_dict_for_output = [
+                copy.deepcopy(dim_partition_dict_for_input) for _ in range(len(self.op_data["output"].data))
+            ]
+            assert len(dim_partition_dict_for_output) >= 2
+            dim_partition_dict_mapping = {
+                "input": dim_partition_dict_for_input,
+                "output": dim_partition_dict_for_output,
+            }
+            sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+            # add index into name to pass the duplicated check
+            # we keep same strategies with different name for node merging, and it will not increase the searching space,
+            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
+            name = f'{sharding_spec_mapping["input"].sharding_sequence}_{index}'
+
+            # add comm action if the input need to be recovered to replica in the split dimension.
+            if recover_dims:
+                # if there is only one sharding dimension, we should use the value instead of list as logical_process_axis.
+                if len(recover_dims) == 1:
+                    recover_dims = recover_dims[0]
+                    input_comm_action = self.get_communication_action(
+                        sharding_spec=sharding_spec_mapping["input"],
+                        communication_pattern=CollectiveCommPattern.GATHER_FWD_SPLIT_BWD,
+                        logical_process_axis=recover_dims,
+                        comm_type=CommType.BEFORE,
+                        arg_index=0)
+                    # it will gather the input through gather_dim during forward phase.
+                    input_comm_action.comm_spec.gather_dim = split_dim
+                    # it will split the input activation grad through split_dim during backward phase.
+                    input_comm_action.comm_spec.shard_dim = split_dim
+
+                elif len(recover_dims) >= 2:
+                    # original sharding spec
+                    source_spec = input_sharding_spec
+                    # target sharding spec
+                    target_spec = sharding_spec_mapping["input"]
+                    comm_spec = {'src_spec': source_spec, 'tgt_spec': target_spec}
+                    input_comm_action = CommAction(comm_spec=comm_spec, comm_type=CommType.BEFORE, arg_index=0)
+
+                else:
+                    input_comm_action = None
+
+                if input_comm_action is not None:
+                    communication_action_mapping["input"] = input_comm_action
+
+            strategy = self.get_sharding_strategy(name=name,
+                                                  sharding_spec_mapping=sharding_spec_mapping,
+                                                  communication_action_mapping=communication_action_mapping)
+            strategy_list.append(strategy)
+
+        return strategy_list
+
+
+class DefaultReshapeGenerator(ReshapeGenerator):
+    """
+    DefaultReshapeGenerator which deals with the sharding strategies of Reshape Op which have to recover the tensor
+    to Replica status.
+    """
+
+    def collate_strategies(self) -> List[ShardingStrategy]:
+        strategy_list = []
+        # For default reshape strategy, to keep the computing correctness we keep the
+        # sharding spec of input is fully replicated. In addition, we will keep the output
+        # in replica status and let the successor node choose the way to resharding the
         # output node. Therefore, the different strategies of input node with same
         # output sharding spec will generate same strategy for reshape function.
         for index, strategy in enumerate(self.predecessor_node.strategies_vector):
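TransposeGenerator's remapping above reduces to swapping two keys; a standalone sketch with hypothetical values:

```python
# Standalone sketch of TransposeGenerator's dim swap: sharding on the two
# transposed dims trades places, every other dim keeps its sharding.
def transpose_partition(dim_partition_dict_for_input, dim_0, dim_1):
    dim_partition_dict_for_output = {}
    for dim, sharded_dims in dim_partition_dict_for_input.items():
        if dim == dim_0:
            dim_partition_dict_for_output[dim_1] = sharded_dims
        elif dim == dim_1:
            dim_partition_dict_for_output[dim_0] = sharded_dims
        else:
            dim_partition_dict_for_output[dim] = sharded_dims
    return dim_partition_dict_for_output


# transpose(0, 2) on a tensor sharded on dims 0 and 1 (hypothetical values):
assert transpose_partition({0: [0], 1: [1]}, 0, 2) == {2: [0], 1: [1]}
```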
@@ -114,9 +362,4 @@ class ReshapeGenerator(FollowingStrategyGenerator):
                                                   communication_action_mapping=communication_action_mapping)
             strategy_list.append(strategy)

-        for strategy in strategy_list:
-            self.update_communication_cost(strategy)
-            self.update_compute_cost(strategy)
-            self.update_memory_cost(strategy)
-
         return strategy_list
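SplitGenerator, added above, cannot keep the input sharded along the dimension being split, which is why it pops that entry and schedules a gather. A standalone sketch of that bookkeeping, with hypothetical values:

```python
import copy


# Standalone sketch of SplitGenerator's partition handling: sharding on the
# split dim must be dropped (and recovered by a gather comm action); the
# remaining sharding is copied to every output chunk.
def split_partition(dim_partition_dict_for_input, split_dim, num_outputs):
    recover_dims = None
    input_partition = copy.deepcopy(dim_partition_dict_for_input)
    if split_dim in input_partition:
        recover_dims = input_partition.pop(split_dim)    # mesh axes to gather over
    output_partitions = [copy.deepcopy(input_partition) for _ in range(num_outputs)]
    return input_partition, output_partitions, recover_dims


# Split along dim 0 of a tensor sharded on dims 0 and 1:
inp, outs, recover = split_partition({0: [0], 1: [1]}, split_dim=0, num_outputs=2)
assert inp == {1: [1]} and recover == [0] and len(outs) == 2
```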
@@ -2,11 +2,10 @@ from typing import Dict, List

 import torch

-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import TransposeGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import StrategyGenerator, TransposeGenerator

 __all__ = ['TransposeHandler']

@@ -2,11 +2,10 @@ from typing import Dict, List

 import torch

-from ...sharding_strategy import OperationData, OperationDataType
-from ..node_handler import NodeHandler
-from ..registry import operator_registry
-from ..strategy import StrategyGenerator
-from .reshape_generator import ViewGenerator
+from ..sharding_strategy import OperationData, OperationDataType
+from .node_handler import NodeHandler
+from .registry import operator_registry
+from .strategy import StrategyGenerator, ViewGenerator

 __all__ = ['ViewHandler']

@@ -1,8 +1,8 @@
 import torch
 import torch.nn as nn

+from colossalai.auto_parallel.tensor_shard.node_handler import DefaultReshapeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.reshape_handler import ReshapeHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
@@ -51,9 +51,9 @@ def test_reshape_handler():
                                      strategies_vector=conv_strategies_vector)
     conv_handler.register_strategy(compute_resharding_cost=False)
     setattr(conv_mod_node, 'strategies_vector', conv_strategies_vector)
-    reshape_handler = ReshapeHandler(node=reshape_node,
-                                     device_mesh=device_mesh,
-                                     strategies_vector=reshape_strategies_vector)
+    reshape_handler = DefaultReshapeHandler(node=reshape_node,
+                                            device_mesh=device_mesh,
+                                            strategies_vector=reshape_strategies_vector)

     reshape_handler.register_strategy(compute_resharding_cost=False)

@@ -5,10 +5,10 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn

+from colossalai.auto_parallel.tensor_shard.node_handler.default_reshape_handler import DefaultReshapeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.getitem_handler import GetItemHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.placeholder_handler import PlaceholderHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.reshape_handler import ReshapeHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx import ColoGraphModule, ColoTracer
@@ -153,7 +153,9 @@ def test_getitem_from_tuple_handler():
     )
     input_handler.register_strategy(compute_resharding_cost=False)
     setattr(input_node, 'strategies_vector', input_strategies_vector)
-    split_handler = ReshapeHandler(node=split_node, device_mesh=device_mesh, strategies_vector=split_strategies_vector)
+    split_handler = DefaultReshapeHandler(node=split_node,
+                                          device_mesh=device_mesh,
+                                          strategies_vector=split_strategies_vector)
     split_handler.register_strategy(compute_resharding_cost=False)
     setattr(split_node, 'strategies_vector', split_strategies_vector)
     getitem_handler = GetItemHandler(node=getitem_node,
@@ -5,8 +5,8 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn

+from colossalai.auto_parallel.tensor_shard.node_handler import PermuteHandler, TransposeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import PermuteHandler, TransposeHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
@@ -5,8 +5,8 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn

+from colossalai.auto_parallel.tensor_shard.node_handler import SplitHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import SplitHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh
@@ -156,8 +156,7 @@ def check_split_handler(rank, split_size, split_dim, model_cls, world_size, port
     # reshape handler is a following strategy handler, so the number of strategies is equal to the predecessor node.
     assert len(split_strategies_vector) == len(previous_strategies_vector)
     strategy_name_list = [strategy.name for strategy in split_strategies_vector]
-    for name in strategy_name_list:
-        print(name)
+
     if model_cls.__name__ == 'ConvSplitModel':

         if split_dim == 0:
@@ -5,8 +5,8 @@ import torch
 import torch.multiprocessing as mp
 import torch.nn as nn

+from colossalai.auto_parallel.tensor_shard.node_handler import ViewHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.conv_handler import ConvFunctionHandler
-from colossalai.auto_parallel.tensor_shard.node_handler.experimental import ViewHandler
 from colossalai.auto_parallel.tensor_shard.node_handler.linear_handler import LinearFunctionHandler
 from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, OperationDataType, StrategiesVector
 from colossalai.device.device_mesh import DeviceMesh