mirror of https://github.com/hpcaitech/ColossalAI
[fx] supported data-dependent control flow in model tracing (#1185)
* [fx] supported data-dependent control flow in model tracing
* polish code
parent c463f8adf9
commit 6d86f1bc91
@@ -0,0 +1 @@
from .tracer import ColoTracer
@@ -37,6 +37,12 @@ class ColoProxy(Proxy):

    def _assert_has_meta(self):
        assert self.has_meta_tensor, f'Meta tensor is not set for {self.node.name}'

    @property
    def device(self):
        # Hack so we can track when devices are used. During meta-tensor propagation,
        # replace these values with a constant 'meta'.
        return MetaDeviceAttribute(self, "device")

    @property
    def dtype(self):
        self._assert_has_meta()
@@ -72,3 +78,27 @@ class ColoProxy(Proxy):

    def __setitem__(self, indices, values):
        return self.tracer.create_proxy("call_function", operator.setitem, (self, indices, values), {})


class ColoAttribute(ColoProxy):

    def __init__(self, root, attr: str):
        # this class is copied from torch.fx.Attribute
        # but inherits from ColoProxy
        self.root = root
        self.attr = attr
        self.tracer = root.tracer
        self._node = None

    @property
    def node(self):
        # the node is created lazily: most attribute accesses are immediately
        # called as methods and never need a standalone getattr node
        if self._node is None:
            self._node = self.tracer.create_proxy("call_function", getattr, (self.root, self.attr), {}).node
        return self._node

    def __call__(self, *args, **kwargs):
        return self.tracer.create_proxy("call_method", self.attr, (self.root,) + args, kwargs)


class MetaDeviceAttribute(ColoAttribute):
    pass
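
For context, here is a standalone illustration with stock torch.fx (a sketch, not part of this commit's classes) of the lazy-node behavior that ColoAttribute.node replicates: attribute access alone emits no node, and only the eventual method call is recorded in the graph.

    import torch
    from torch import fx

    class M(torch.nn.Module):

        def forward(self, x):
            # `x.unsqueeze` is an attribute access on a proxy; only the call
            # produces a node, so the graph holds a single call_method node
            return x.unsqueeze(0)

    graph = fx.Tracer().trace(M())
    print([(n.op, n.target) for n in graph.nodes])
    # [('placeholder', 'x'), ('call_method', 'unsqueeze'), ('output', 'output')]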
@@ -0,0 +1 @@
from .tracer import ColoTracer
@@ -0,0 +1,31 @@
from typing import List, Union, Any
from ..proxy import ColoProxy, MetaDeviceAttribute

__all__ = ['is_element_in_list', 'extract_meta']


def is_element_in_list(elements: Union[List[Any], Any], list_: List[Any]):
    # returns (True, None) if every element is found in the list,
    # otherwise (False, <the first missing element>)
    if isinstance(elements, (tuple, list, set)):
        for ele in elements:
            if ele not in list_:
                return False, ele
    else:
        if elements not in list_:
            return False, elements

    return True, None


def extract_meta(*args, **kwargs):
    # unwrap proxies in args/kwargs to their underlying meta tensors so the
    # operation can be executed on the meta device for shape inference

    def _convert(val):
        if isinstance(val, MetaDeviceAttribute):
            # device attributes are replaced by the constant device string 'meta'
            return 'meta'
        elif isinstance(val, ColoProxy):
            assert val.meta_tensor is not None
            return val.meta_tensor
        return val

    new_args = [_convert(val) for val in args]
    new_kwargs = {k: _convert(v) for k, v in kwargs.items()}
    return new_args, new_kwargs
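
Aside: the meta tensors this utility unwraps record shape and dtype but allocate no storage, which is what makes shape inference free during tracing. A minimal, self-contained demonstration:

    import torch

    t = torch.empty(4, 10, device='meta')
    print(t.shape, t.dtype, t.is_meta)    # torch.Size([4, 10]) torch.float32 True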
@@ -0,0 +1,3 @@
from .registry import *
from .patched_function import *
from .patched_module import *
@@ -0,0 +1,7 @@
import torch
from .registry import meta_patched_module


@meta_patched_module.register(torch.nn.Linear)
def torch_nn_linear(self, input):
    # only the output shape matters on the meta device:
    # (*batch_dims, in_features) -> (*batch_dims, out_features)
    return torch.empty(input.shape[:-1] + (self.out_features,), device="meta")
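
The same registration pattern extends to other shape-preserving modules. A hypothetical sketch (not part of this commit; torch_nn_layernorm is illustrative only):

    import torch
    from .registry import meta_patched_module


    @meta_patched_module.register(torch.nn.LayerNorm)
    def torch_nn_layernorm(self, input):
        # normalization keeps the input shape, so an empty meta tensor
        # of the same shape is enough for shape propagation
        return torch.empty(input.shape, device="meta")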
@@ -0,0 +1,25 @@
class PatchRegistry:
    # maps a source object (a function or a module type) to the patched
    # function used in its place during meta-tensor execution

    def __init__(self, name):
        self.name = name
        self.store = {}

    def register(self, source):
        # decorator which registers `func` as the patch for `source`

        def wrapper(func):
            self.store[source] = func
            return func

        return wrapper

    def get(self, source):
        assert source in self.store, f'{source} is not registered in {self.name}'
        target = self.store[source]
        return target

    def has(self, source):
        return source in self.store


meta_patched_function = PatchRegistry(name='patched_functions_for_meta_execution')
meta_patched_module = PatchRegistry(name='patched_modules_for_meta_execution')
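
For completeness, the function-side registry is used the same way as the module-side one above. A hypothetical sketch (not part of this commit; the relu rule below is illustrative only):

    import torch
    from .registry import meta_patched_function


    @meta_patched_function.register(torch.nn.functional.relu)
    def torch_nn_functional_relu(input, inplace=False):
        # relu preserves shape and dtype, so echo the input back as a meta tensor
        return torch.empty_like(input, device="meta")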
@@ -0,0 +1,305 @@
#!/usr/bin/env python
"""
tracer.py:
    Implements a tracer which supports control flow and user-defined meta arguments.
    The implementation is partly inspired by HuggingFace's fx tracer.
"""

import inspect
import math
import functools
import torch
import torch.nn as nn
from torch import Tensor
from torch.fx import Tracer
from torch.fx.graph import Graph
from torch.fx.proxy import Proxy, ParameterProxy
from torch.utils import _pytree
from ..proxy import ColoProxy
from typing import Optional, Dict, Any
from ._tracer_utils import is_element_in_list, extract_meta
from .meta_patch import meta_patched_function, meta_patched_module

__all__ = ['ColoTracer']


class ColoTracer(Tracer):
    """
    ColoTracer is a symbolic tracer of the `colossalai.fx` module designed to support dynamic control flow by using meta tensors.
    This tracer is initialized in the same way as the original torch.fx.Tracer.

    Usage:
        class Model(nn.Module):

            def __init__(self):
                super().__init__()
                self.linear1 = nn.Linear(10, 10)
                self.linear2 = nn.Linear(10, 10)

            def forward(self, x, y):
                x1 = self.linear1(x)
                y1 = self.linear2(y)

                if x1.dim() == 2:
                    return x1 + y1
                else:
                    return x1 - y1

        model = Model()
        tracer = ColoTracer()
        graph = tracer.trace(model,
                             concrete_args={'y': torch.rand(4, 10)},
                             meta_args={'x': torch.rand(4, 10, device='meta')})
    """

    # Feature flag for proxying accesses to buffer values
    proxy_buffer_attributes: bool = True

    _TORCH_METHODS_TO_PATCH = ["arange", "zeros", "ones", "full", "full_like", "eye", "empty", "tensor"]
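
    # The names above are tensor constructors on the `torch` module; trace() below
    # temporarily swaps them for wrappers so that calls which receive a Proxy
    # argument are recorded in the graph rather than executed eagerly.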

    def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, proxy_factory_fn=None) -> ColoProxy:
        """
        Create a proxy for different kinds of operations and attach the meta tensor
        obtained by executing the operation on the meta device.
        """
        proxy = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
        proxy: ColoProxy

        if kind == "placeholder" and target in self.meta_args and self.meta_args[target].is_meta:
            proxy.meta_tensor = self.meta_args[target]
            return proxy

        if target in self.orig_torch_tensor_methods:
            # NOTE: tensor constructors in PyTorch define the `device` argument as
            # *kwargs-only*. That is why this works. If you add methods to
            # _TORCH_METHODS_TO_PATCH that do not define `device` as kwarg-only,
            # this will break and you will likely see issues where we cannot infer
            # the size of the output.
            if "device" in kwargs:
                kwargs["device"] = "meta"

        try:
            args_metas, kwargs_metas = extract_meta(*args, **kwargs)

            if kind == "call_function":
                # fetch the patched function if one is registered
                if meta_patched_function.has(target):
                    meta_target = meta_patched_function.get(target)
                else:
                    meta_target = target

                meta_out = meta_target(*args_metas, **kwargs_metas)
                if isinstance(meta_out, torch.Tensor):
                    meta_out = meta_out.to(device="meta")
            elif kind == "call_method":
                method = getattr(args_metas[0].__class__, target)

                # fetch the patched method if one is registered
                if meta_patched_function.has(method):
                    meta_target = meta_patched_function.get(method)
                else:
                    meta_target = method

                meta_out = meta_target(*args_metas, **kwargs_metas)
            elif kind == "call_module":
                if not hasattr(self, "orig_forward"):
                    raise AttributeError(f"{self} does not have an attribute called orig_forward")
                self._disable_module_getattr = True
                try:
                    mod = self.root.get_submodule(target)
                    mod_type = type(mod)
                    if meta_patched_module.has(mod_type):
                        meta_out = meta_patched_module.get(mod_type)(mod, *args_metas, **kwargs_metas)
                    else:
                        meta_out = self.orig_forward(*args_metas, **kwargs_metas)
                finally:
                    self._disable_module_getattr = False
            elif kind == "get_attr":
                self._disable_module_getattr = True
                try:
                    # resolve the dotted attribute path on the root module
                    attr_itr = self.root
                    atoms = target.split(".")
                    for atom in atoms:
                        attr_itr = getattr(attr_itr, atom)
                    if isinstance(attr_itr, torch.Tensor):
                        meta_out = attr_itr.to(device="meta")
                    else:
                        meta_out = attr_itr
                finally:
                    self._disable_module_getattr = False
            else:
                return proxy

            if not isinstance(proxy, Proxy):
                raise ValueError("Don't support composite output yet")
            proxy.meta_tensor = meta_out
        except Exception as e:
            raise RuntimeError(f"Could not compute metadata for {kind} target {target}: {e}")
        return proxy

    def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
        if getattr(self, "_disable_module_getattr", False):
            return attr_val
        else:
            # adapted from torch.fx.Tracer._module_getattr

            def maybe_get_proxy_for_attr(attr_val, collection_to_search, parameter_proxy_cache):
                for n, p in collection_to_search:
                    if attr_val is p:
                        if n not in parameter_proxy_cache:
                            kwargs = {}
                            if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
                                kwargs["proxy_factory_fn"] = (None if not self.param_shapes_constant else
                                                              lambda node: ParameterProxy(self, node, n, attr_val))
                            val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs)    # type: ignore[arg-type]
                            parameter_proxy_cache[n] = val_proxy
                        return parameter_proxy_cache[n]
                return None

            if isinstance(attr_val, torch.nn.Parameter):
                maybe_parameter_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_parameters(),
                                                                 parameter_proxy_cache)
                if maybe_parameter_proxy is not None:
                    return maybe_parameter_proxy

            if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
                maybe_buffer_proxy = maybe_get_proxy_for_attr(attr_val, self.root.named_buffers(),
                                                              parameter_proxy_cache)
                if maybe_buffer_proxy is not None:
                    return maybe_buffer_proxy

            return attr_val

    def call_module(self, m, forward, args, kwargs):
        # keep a reference to the original forward so that create_proxy can run
        # the module on meta tensors when no patched module is registered
        self.orig_forward = forward
        return super().call_module(m, forward, args, kwargs)

    def proxy(self, node) -> ColoProxy:
        """
        Returns a ColoProxy object.
        """
        return ColoProxy(node, self)

    def trace(self,
              root: nn.Module,
              concrete_args: Optional[Dict[str, Tensor]] = None,
              meta_args: Optional[Dict[str, Tensor]] = None) -> Graph:
        """
        Trace the forward computation graph using `torch.fx.Tracer`. This tracer enables data-dependent control flow.

        Args:
            root (nn.Module): an `nn.Module` object whose computation graph will be traced
            meta_args (Optional[Dict[str, Tensor]]): the meta tensor arguments used to trace the computation graph.
                These arguments are the sample data fed to the model during actual computation, but converted to meta tensors.
            concrete_args (Optional[Dict[str, Tensor]]): the concrete arguments that should not be treated as Proxies.
        """
        if meta_args is None:
            meta_args = {}

        if concrete_args is None:
            concrete_args = {}

        # check that concrete and meta args have valid names
        sig = inspect.signature(root.forward)
        sig_names = set(sig.parameters.keys())
        meta_arg_names = set(meta_args.keys())
        concrete_arg_names = set(concrete_args.keys())
        non_concrete_arg_names = sig_names - concrete_arg_names

        def _check_arg_name_valid(names):
            success, element = is_element_in_list(names, sig_names)
            if not success:
                raise KeyError(
                    f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function")

        _check_arg_name_valid(meta_arg_names)
        _check_arg_name_valid(concrete_arg_names)

        # concrete args must be real tensors while meta args must be meta tensors
        def _check_kwargs(kwargs, should_be_meta: bool):
            for k, v in kwargs.items():
                assert v.is_meta == should_be_meta, \
                    f'expected the is_meta attribute of {k} to be {should_be_meta}, but got {v.is_meta}, please check the args passed to the tracer'

        _check_kwargs(concrete_args, should_be_meta=False)
        _check_kwargs(meta_args, should_be_meta=True)

        # assign as attributes for later reference
        self.concrete_args = concrete_args
        self.meta_args = meta_args

        # wrap the torch tensor constructor methods so that they are captured in the graph
        self.patched_torch_tensor_methods = {
            target: wrap_tensor_constructor_method(getattr(torch, target)) for target in self._TORCH_METHODS_TO_PATCH
        }

        # patch these methods to replace their original use
        for name, (wrapper, orig) in self.patched_torch_tensor_methods.items():
            setattr(torch, name, wrapper)

        # cache the original methods so that we can detect whether a method call
        # should be patched during tracing
        self.orig_torch_tensor_methods = [val[1] for val in self.patched_torch_tensor_methods.values()]

        try:
            self.graph = super().trace(root, concrete_args=concrete_args)
        finally:
            # restore the patched methods
            for name, (_, orig) in self.patched_torch_tensor_methods.items():
                setattr(torch, name, orig)

        # This is necessary because concrete args are added as input to the traced module since
        # https://github.com/pytorch/pytorch/pull/55888.
        for node in self.graph.nodes:
            if node.op == "placeholder":
                # Removing default values for inputs as the forward pass will fail with them.
                if node.target in non_concrete_arg_names:
                    node.args = ()
                    # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
                    # It cannot infer on the attributes and methods the input should have, and fails.
                    node.type = torch.Tensor
                # It is a concrete arg so it is not used and should be removed.
                else:
                    if hasattr(torch.fx._symbolic_trace, "_assert_is_none"):
                        # Newer versions of torch.fx emit an assert statement
                        # for concrete arguments; delete those before we delete
                        # the concrete arg.
                        to_delete = []
                        for user in node.users:
                            if user.target == torch.fx._symbolic_trace._assert_is_none:
                                to_delete.append(user)
                        for user in to_delete:
                            self.graph.erase_node(user)

                    self.graph.erase_node(node)

            # TODO: solve GraphModule creation.
            # Without this, the "Tuple" return type annotation causes code execution to fail.
            if node.op == "output":
                node.type = None

        return self.graph


def wrap_tensor_constructor_method(target):

    def look_for_proxy(*args, **kwargs):
        # find a proxy in the positional arguments
        for arg in args:
            if isinstance(arg, Proxy):
                return arg

        # find a proxy in the keyword arguments
        for k, v in kwargs.items():
            if isinstance(v, Proxy):
                return v
        return None

    @functools.wraps(target)
    def wrapper(*args, **kwargs):
        proxy = look_for_proxy(*args, **kwargs)

        if proxy is not None:
            # if an arg is a proxy, we need to record this function call on the proxy,
            # e.g. torch.ones(size) where size is an input proxy
            return proxy.tracer.create_proxy("call_function", target, args, kwargs)
        else:
            # the function is executed directly when the inputs contain no proxy,
            # e.g. torch.ones(4) where the input is static
            return target(*args, **kwargs)

    return wrapper, target
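
A quick sanity check of the wrapper contract above (a sketch; the import path is an assumption about where this module lives in the package):

    import torch
    from colossalai.fx.tracer.tracer import wrap_tensor_constructor_method    # assumed path

    wrapper, orig = wrap_tensor_constructor_method(torch.ones)
    assert orig is torch.ones
    # with no Proxy among the arguments, the wrapper falls through to the original
    assert torch.equal(wrapper(3), torch.ones(3))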

@@ -0,0 +1,57 @@
import torch
import torch.nn as nn
from torch.fx import GraphModule
from colossalai.fx import ColoTracer as Tracer


class ControlFlowModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(10, 10)
        self.linear2 = nn.Linear(10, 10)

    def forward(self, x, y):
        x1 = self.linear1(x)
        y1 = self.linear2(y)

        if x1.dim() == 2:
            return x1 + y1
        else:
            return x1 - y1


def test_control_flow():
    model = ControlFlowModel()
    tracer = Tracer()
    # a 2D meta tensor for x drives tracing down the `dim() == 2` branch
    graph_branch_true = tracer.trace(model,
                                     meta_args={
                                         'x': torch.rand(4, 10, device='meta'),
                                         'y': torch.rand(4, 10, device='meta')
                                     })
    # a 1D meta tensor for x drives tracing down the else branch
    graph_branch_false = tracer.trace(model,
                                      meta_args={
                                          'x': torch.rand(10, device='meta'),
                                          'y': torch.rand(4, 10, device='meta')
                                      })

    gm_branch_true = GraphModule(model, graph_branch_true, model.__class__.__name__)
    gm_branch_false = GraphModule(model, graph_branch_false, model.__class__.__name__)
    gm_branch_true.recompile()
    gm_branch_false.recompile()

    # test the true branch
    x = torch.rand(4, 10)
    y = torch.rand(4, 10)
    assert torch.all(model(x, y) == gm_branch_true(x, y))
    assert torch.all(gm_branch_false(x, y) != gm_branch_true(x, y))

    # test the false branch
    x = torch.rand(10)
    y = torch.rand(4, 10)
    assert torch.all(model(x, y) == gm_branch_false(x, y))
    assert torch.all(gm_branch_false(x, y) != gm_branch_true(x, y))


if __name__ == '__main__':
    test_control_flow()