[autoparallel] Add metainfo support for F.linear (#1987)

* [fx] metainfo class for auto parallel

* [fx] add unit test for linear metainfo

* [fx] fix bwd param for linear

* [fx] modify unit test

* [fx] modify unit test

* [fx] modify import

* [fx] modify import

* [fx] modify import

* [fx] move meta profiler to auto parallel

* [fx] add conv metainfo class

* [fx] restore profiler

* [fx] restore meta profiler

* [autoparallel] modify unit test

* [fx] modify unit test

* [autoparallel] add batchnorm metainfo class

* [autoparallel] fix batchnorm unit test function declaration

* [fx] restore profiler

* [fx] add relu metainfo class

* [fx] restore profiler

* [autoparallel] modify metainfo input

* [autoparallel] add pooling metainfo

* [autoparallel] add F.linear metainfo generator
pull/2005/head
Boyuan Yao 2 years ago committed by GitHub
parent 2edbef13cc
commit 6cd784ffee

@@ -19,10 +19,13 @@ from ..registry import meta_register

 __all__ = ['linear_meta_info']


+@meta_register.register(torch.nn.functional.linear)
 @meta_register.register(torch.nn.Linear)
 def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
-    """torch.nn.Linear meta info generator
-    The atens graph of torch.nn.Linear with bias is
+    """torch.nn.Linear & torch.nn.functional.linear meta info generator
+    NOTE: currently we separate the bias part from the biased linear ops; the memory consumption of the bias
+    addition will be considered in the add metainfo generator, but the bias mechanism is kept in the linear
+    metainfo generator for future use.

     graph():
     %input_2 : [#users=2] = placeholder[target=placeholder](default=)
     %addmm_default : [#users=1] = call_function[target=torch.ops.aten.addmm.default](args = (None, %input_2, None), kwargs = {})
@@ -65,7 +68,7 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:

     has_bias: bool = False
     input_tensor = next(filter(lambda x: x.type == OperationDataType.ARG, args)).data
     output_tensor = next(filter(lambda x: x.type == OperationDataType.OUTPUT, args)).data
-    weight_tensor = next(filter(lambda x: x.name == 'weight', args)).data
+    weight_tensors = [x.data for x in args if x.type == OperationDataType.PARAM]

     # process the dimension of input and output
     if len(input_tensor.shape) > 2:
@@ -76,9 +79,14 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
         output_tensor: torch.Tensor
         output_tensor = output_tensor.view(-1, output_tensor.shape[-1])

-    if len(args) == 4:
-        bias_tensor = next(filter(lambda x: x.name == 'bias', args)).data
+    if len(weight_tensors) > 1:
         has_bias = True
+        if len(weight_tensors[0].shape) == 2:
+            weight_tensor, bias_tensor = weight_tensors
+        else:
+            bias_tensor, weight_tensor = weight_tensors
+    else:
+        weight_tensor = weight_tensors[0]

     if has_bias:
         # calculate cost with bias
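
Since F.linear hands both weight and bias to the generator as PARAM operands with no guaranteed order, the new code tells them apart by rank: the 2-D tensor is the weight and the 1-D tensor is the bias. Below is a minimal, self-contained sketch of that disambiguation (not the library code; split_weight_and_bias is a hypothetical helper used only for illustration):

import torch

def split_weight_and_bias(param_tensors):
    # The 2-D parameter is the weight; the 1-D parameter, if present, is the bias.
    if len(param_tensors) > 1:
        if len(param_tensors[0].shape) == 2:
            weight, bias = param_tensors
        else:
            bias, weight = param_tensors
        return weight, bias
    return param_tensors[0], None

# Toy check with meta tensors standing in for the PARAM operands of F.linear(input, weight, bias).
weight = torch.empty(128, 64, device='meta')
bias = torch.empty(128, device='meta')
w, b = split_weight_and_bias([bias, weight])
print(w.shape, b.shape)    # torch.Size([128, 64]) torch.Size([128])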

@@ -92,8 +92,12 @@ class MetaInfo:

         Compute meta info based on sharding strategy and the given target function.
         """
-        assert meta_register.has(self._target.__class__), f'{self._target.__class__} not found in the meta registry'
-        meta_func = meta_register.get(self._target.__class__)
+        try:
+            # module
+            meta_func = meta_register.get(self._target.__class__)
+        except:
+            # function
+            meta_func = meta_register.get(self._target)

         # construct args for meta_func
         args = [self.compute_sharded_tensor(k, v) for k, v in self._strategy.sharding_specs.items()]
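
The try/except replaces the class-only assert so that one registry can serve both kinds of targets: nn.Linear instances are looked up by their class, while the plain function F.linear is looked up by the function object itself. A toy sketch of that dispatch pattern follows; the Registry class and its register/get methods are illustrative stand-ins, not the actual meta_register API.

import torch
import torch.nn.functional as F

class Registry:
    # Hypothetical stand-in for the meta registry, keyed by class or function.
    def __init__(self):
        self._table = {}

    def register(self, key):
        def wrapper(func):
            self._table[key] = func
            return func
        return wrapper

    def get(self, key):
        return self._table[key]    # raises KeyError for unregistered targets

registry = Registry()

@registry.register(F.linear)
@registry.register(torch.nn.Linear)
def linear_meta(*args, **kwargs):
    return "linear meta info"

def lookup(target):
    # Module targets are registered by their class, function targets by themselves.
    try:
        return registry.get(target.__class__)
    except KeyError:
        return registry.get(target)

print(lookup(torch.nn.Linear(4, 4))())    # resolved via the module class
print(lookup(F.linear)())                 # resolved via the function object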

@@ -20,7 +20,17 @@ if torch.__version__ >= '1.12.0':
     from colossalai.auto_parallel.meta_profiler import MetaInfo, meta_register


-def _linear_module_mem_test(rank, bias, world_size, port):
+class MyModule(nn.Module):
+
+    def __init__(self, in_features=64, out_features=128):
+        super().__init__()
+        self.fc_weight = nn.Parameter(torch.randn(out_features, in_features))
+
+    def forward(self, input):
+        return nn.functional.linear(input, self.fc_weight)
+
+
+def _linear_module_mem_test(rank, world_size, port):
     """This function is for linear memory test

     Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
@@ -32,7 +42,7 @@ def _linear_module_mem_test(rank, bias, world_size, port):
     """
     disable_existing_loggers()
     launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
-    model = nn.Sequential(nn.Linear(64, 128, bias=bias)).cuda()
+    model = nn.Sequential(nn.Linear(64, 128, bias=False)).cuda()
     input = torch.rand(8, 8, 16, 64).cuda()
     input.requires_grad = True
     physical_mesh_id = torch.arange(0, 4)
@@ -52,11 +62,50 @@ def _linear_module_mem_test(rank, bias, world_size, port):
 @run_on_environment_flag(name='AUTO_PARALLEL')
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
-def test_linear_meta_concrete_info_match(bias=False):
+def test_linear_module_meta_concrete_info_match():
+    world_size = 4
+    run_func_module = partial(_linear_module_mem_test, world_size=world_size, port=free_port())
+    mp.spawn(run_func_module, nprocs=world_size)
+
+
+def _linear_function_mem_test(rank, world_size, port):
+    """This function is for linear memory test
+
+    Test and print real memory cost and estimated, this test will not be executed except with the tag AUTO_PARALLEL
+
+    Args:
+        rank: device rank
+        world_size: number of devices
+        port: port for initializing process group
+    """
+    disable_existing_loggers()
+    launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    model = MyModule().cuda()
+    input = torch.rand(8, 8, 16, 64).cuda()
+    input.requires_grad = True
+    physical_mesh_id = torch.arange(0, 4)
+    mesh_shape = (2, 2)
+    device_mesh = DeviceMesh(physical_mesh_id, mesh_shape, init_process_group=True)
+
+    # memory test
+    mem_test_for_node_strategy(rank=rank,
+                               model=model,
+                               device_mesh=device_mesh,
+                               node_index=2,
+                               strategy_number=13,
+                               input_args=[input],
+                               meta_arg_names=["input"])
+
+
+@run_on_environment_flag(name='AUTO_PARALLEL')
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+def test_linear_function_meta_concrete_info_match():
     world_size = 4
-    run_func_module = partial(_linear_module_mem_test, bias=bias, world_size=world_size, port=free_port())
+    run_func_module = partial(_linear_function_mem_test, world_size=world_size, port=free_port())
     mp.spawn(run_func_module, nprocs=world_size)


 if __name__ == '__main__':
-    test_linear_meta_concrete_info_match()
+    # test_linear_module_meta_concrete_info_match()
+    test_linear_function_meta_concrete_info_match()

@@ -7,6 +7,7 @@ from torch.fx import GraphModule

 from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply_pass
 from colossalai.auto_parallel.passes.runtime_preparation_pass import runtime_preparation_pass
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationDataType
 from colossalai.auto_parallel.tensor_shard.solver import SolverOptions, StrategiesConstructor
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.fx.tracer.tracer import ColoTracer
@@ -49,8 +50,9 @@ def mem_test_for_node_strategy(rank: int,

     # construct the strategy for the output node
     placeholder_strategy = list(graph.nodes)[-1].strategies_vector[0]
+
     output_key = next(key for key in target_node.strategies_vector[strategy_index].sharding_specs.keys()
-                      if key in placeholder_strategy.sharding_specs)
+                      if key.type == OperationDataType.OUTPUT)
     placeholder_strategy.sharding_specs[output_key] = target_node.strategies_vector[strategy_index].sharding_specs[
         output_key]
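
The rewritten filter picks the output operand by its type instead of testing key membership in the placeholder strategy's specs. Here is a small stand-alone sketch of that selection logic, using hypothetical stand-ins for OperationData and its type enum rather than the real colossalai classes:

from dataclasses import dataclass
from enum import Enum

class OperationDataType(Enum):    # stand-in for the colossalai enum
    ARG = 0
    PARAM = 1
    OUTPUT = 2

@dataclass(frozen=True)
class OperationData:              # stand-in; the real keys also carry shapes and sharding specs
    name: str
    type: OperationDataType

sharding_specs = {
    OperationData('input', OperationDataType.ARG): 'S0R',
    OperationData('weight', OperationDataType.PARAM): 'RS1',
    OperationData('output', OperationDataType.OUTPUT): 'S0S1',
}

# Pick the output operand directly by its type, independent of which keys the
# placeholder strategy happens to contain.
output_key = next(key for key in sharding_specs.keys() if key.type == OperationDataType.OUTPUT)
print(output_key.name)    # output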
@@ -104,8 +106,12 @@ def mem_test_for_node_strategy(rank: int,
                     )

     # estimated memory
-    metainfo = MetaInfo(target_node.strategies_vector[strategy_index],
-                        target_node.graph.owning_module.get_submodule(target_node.target))
+    if target_node.op == "call_module":
+        metainfo = MetaInfo(target_node.strategies_vector[strategy_index],
+                            target_node.graph.owning_module.get_submodule(target_node.target))
+    else:
+        metainfo = MetaInfo(target_node.strategies_vector[strategy_index], target_node.target)
+
     print("estimated memory:")
     print(
         f"forward activation: {metainfo.memory_cost.fwd.activation / 1024} kb, forward param: {metainfo.memory_cost.fwd.parameter / 1024} kb"
