mirror of https://github.com/hpcaitech/ColossalAI
[autoparallel] protect bcast handler from invalid strategies (#1631)
parent 0c703189b9
commit 69448f64c4
@@ -40,6 +40,11 @@ class BcastOpHandler(OperatorHandler):
         for dim_index, _ in dim_partition_dict.items():
             if shape[dim_index] == 1:
                 processed_dim_partition_dict.pop(dim_index)
+        for dim_index, sharding_index_list in processed_dim_partition_dict.items():
+            sharding_list = [self.device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
+            sharding_size = reduce(operator.mul, sharding_list, 1)
+            assert shape[
+                dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
         sharding_spec = ShardingSpec(device_mesh=self.device_mesh,
                                      entire_shape=shape,
                                      dim_partition_dict=processed_dim_partition_dict)
|
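The added loop is the protection named in the commit title: a strategy is rejected when the sharding does not divide the tensor dimension evenly. A minimal standalone sketch of the same divisibility rule (function and variable names below are illustrative, not taken from the handler):

from functools import reduce
import operator

def is_valid_partition(shape, dim_partition_dict, mesh_shape):
    # A dimension may only be sharded if its size is an exact multiple of the
    # product of the device-mesh axes assigned to it.
    for dim_index, mesh_dims in dim_partition_dict.items():
        sharding_size = reduce(operator.mul, [mesh_shape[m] for m in mesh_dims], 1)
        if shape[dim_index] % sharding_size != 0:
            return False
    return True

# Example on a 2 x 4 mesh: an (8, 6) tensor can be sharded on dim 0 over both
# mesh axes (8 % 8 == 0), but not on dim 1 over the size-4 axis (6 % 4 != 0).
print(is_valid_partition((8, 6), {0: [0, 1]}, (2, 4)))  # True
print(is_valid_partition((8, 6), {1: [1]}, (2, 4)))     # False
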
@@ -83,14 +88,10 @@ class BcastOpHandler(OperatorHandler):
                                       entire_shape=new_entire_shape,
                                       dim_partition_dict=new_dim_partition_dict)

-            # compute the resharding cost during forward phase
-            _, _, resharding_cost_forward = shape_consistency_manager.shape_consistency(
+            # compute the resharding cost
+            _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
                 input_sharding_spec, input_spec)

-            _, _, resharding_cost_backward = shape_consistency_manager.shape_consistency(
-                input_spec, input_sharding_spec)
-            total_resharding_cost = resharding_cost_forward + resharding_cost_backward
-
             # we need multiply the size of elem dtype to get correct communication cost
             resharding_cost = total_resharding_cost * size_per_elem_bytes
             resharding_costs[input_node].append(resharding_cost)
|
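As the retained comment notes, the cost returned by the shape-consistency pass still has to be multiplied by the element size of the tensor dtype to become a communication cost in bytes. A hedged sketch of that scaling step (the helper below is hypothetical, not part of the handler):

import torch

def to_comm_cost_bytes(resharding_cost: float, dtype: torch.dtype) -> float:
    # element_size() gives the per-element byte width of the dtype,
    # e.g. 4 for float32 and 2 for float16.
    size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
    return resharding_cost * size_per_elem_bytes

print(to_comm_cost_bytes(1024.0, torch.float32))  # 4096.0
print(to_comm_cost_bytes(1024.0, torch.float16))  # 2048.0
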
@@ -102,7 +103,11 @@ class BcastOpHandler(OperatorHandler):
         sharding_spec_list = []
         check_duplicated_list = []
         for output_dim_partition_dict in dim_partition_list:
-            output_sharding_spec = self._generate_sharding_spec(self.output_data, output_dim_partition_dict)
+            try:
+                output_sharding_spec = self._generate_sharding_spec(self.output_data, output_dim_partition_dict)
+            except AssertionError as e:
+                warnings.warn(f'{e}')
+                break
             sharding_seq = output_sharding_spec.sharding_sequence
             if sharding_seq not in check_duplicated_list:
                 check_duplicated_list.append(sharding_seq)
|
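The try/except wrapper is the second half of the protection: if building the output sharding spec trips the new assertion, the strategy is dropped with a warning instead of crashing strategy enumeration. A generic sketch of the same pattern, independent of ColossalAI's classes (build_spec and the candidate list are placeholders):

import warnings

def filter_valid_specs(candidates, build_spec):
    # Keep only candidates whose spec can be built; an AssertionError from the
    # builder marks the candidate as invalid and is downgraded to a warning.
    valid_specs = []
    for candidate in candidates:
        try:
            spec = build_spec(candidate)
        except AssertionError as e:
            warnings.warn(f'skipping invalid candidate {candidate}: {e}')
            continue
        valid_specs.append(spec)
    return valid_specs

Note that the handler itself breaks out of the loop on failure, whereas this sketch simply skips the offending candidate and keeps going.
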
@@ -166,7 +171,7 @@ class BcastOpHandler(OperatorHandler):
     ##############################################
     #used to generate strategies for torch.matmul#
     ##############################################
-    # @exception_handler
+    @exception_handler
     def _registry_no_split_strategies_for_matmul(self, dim_partition_dict_for_batch_dim):
         # this dim partition dict only describes the batch dimensions, but in this scenario,
         # matrix dimensions are fully replicated, so it do not need extra process.
|
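This and the following hunks re-enable the exception_handler decorator on the matmul strategy-registration methods. The decorator's implementation is not part of this diff; a minimal sketch of what such a decorator could look like, assuming it only converts an AssertionError raised while registering a strategy into a warning, is:

import functools
import warnings

# Hypothetical sketch: the real `exception_handler` lives elsewhere in the
# ColossalAI code base. The assumption here is that it lets one invalid
# candidate fail softly instead of aborting strategy registration.
def exception_handler(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except AssertionError as e:
            warnings.warn(f'{func.__name__} skipped: {e}')
            return None
    return wrapper
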
@@ -205,6 +210,7 @@ class BcastOpHandler(OperatorHandler):

         self.strategies_vector.append(sharding_strategies)

+    @exception_handler
     def _split_dim_i(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
         # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
         # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
|
@@ -262,6 +268,7 @@ class BcastOpHandler(OperatorHandler):

         self.strategies_vector.append(sharding_strategies)

+    @exception_handler
     def _split_dim_k(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
         # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
         # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
|
@@ -325,6 +332,7 @@ class BcastOpHandler(OperatorHandler):

         self.strategies_vector.append(sharding_strategies)

+    @exception_handler
     def _split_dim_j(self, dim_partition_dict_for_batch_dim, mesh_dim_on_matrix):
         # A batched matrix multiplication can be viewed as [b, i, k] x [b, k, j] -> [b, i, j]
         # this dim partition dict describe the batch dimensions, so we should append the matrix dimension sharding info on it.
|
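The comments in these methods describe the convention used throughout the matmul strategies: a batched matmul is viewed as [b, i, k] x [b, k, j] -> [b, i, j], the incoming dict only covers the batch dimensions, and the matrix dimensions (addressed with negative indices -2 and -1) are appended on top of it. A small illustrative sketch of that merge (function and example values are made up):

from copy import deepcopy

def append_matrix_dim_sharding(batch_dim_partition_dict, matrix_dim_partition_dict):
    # Start from the batch-dimension sharding and add the matrix-dimension
    # sharding on top; negative keys -2/-1 address the last two dimensions.
    merged = deepcopy(batch_dim_partition_dict)
    merged.update(matrix_dim_partition_dict)
    return merged

# Batch dim 0 sharded on mesh axis 0, and dim i (index -2) of the lhs
# additionally sharded on mesh axis 1.
print(append_matrix_dim_sharding({0: [0]}, {-2: [1]}))  # {0: [0], -2: [1]}
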
@@ -390,6 +398,7 @@ class BcastOpHandler(OperatorHandler):
             self._split_dim_k(dim_partition_dict, mesh_dim_list)
             self._split_dim_j(dim_partition_dict, mesh_dim_list)

+    @exception_handler
     def _split_lhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
         dim_partition_dict_for_lhs = {-2: [mesh_dim_0], -1: [mesh_dim_1]}
         sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
|
@@ -426,6 +435,7 @@ class BcastOpHandler(OperatorHandler):

         self.strategies_vector.append(sharding_strategies)

+    @exception_handler
     def _split_rhs_space_both_contract(self, mesh_dim_0, mesh_dim_1):
         dim_partition_dict_for_lhs = {-1: [mesh_dim_0]}
         sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
|
@@ -464,6 +474,7 @@ class BcastOpHandler(OperatorHandler):

         self.strategies_vector.append(sharding_strategies)

+    @exception_handler
     def _split_lhs_space_rhs_space(self, mesh_dim_0, mesh_dim_1):
         dim_partition_dict_for_lhs = {-2: [mesh_dim_0]}
         sharding_spec_for_lhs = self._generate_sharding_spec(self.lhs_data, dim_partition_dict_for_lhs)
|