mirror of https://github.com/hpcaitech/ColossalAI
564 lines
24 KiB
Python
564 lines
24 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
tracer.py:
|
|
Implemented a tracer which supports control flow and user-defined meta arguments.
|
|
The implementation is partly inspired by HuggingFace's fx tracer
|
|
"""
|
|
import enum
|
|
import functools
|
|
import inspect
|
|
import operator
|
|
from contextlib import contextmanager
|
|
from typing import Any, Dict, Optional
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
from torch import Tensor
|
|
from torch.fx import Node, Tracer
|
|
from torch.fx.graph import Graph, magic_methods, reflectable_magic_methods
|
|
from torch.fx.proxy import ParameterProxy, Proxy
|
|
|
|
from ..proxy import ColoProxy
|
|
from ._tracer_utils import compute_meta_data_for_functions_proxy, extract_meta, is_element_in_list
|
|
from .bias_addition_patch import func_to_func_dict, method_to_func_dict, module_to_func_dict
|
|
from .registry import (
|
|
bias_addition_function,
|
|
bias_addition_method,
|
|
bias_addition_module,
|
|
meta_patched_function,
|
|
meta_patched_module,
|
|
)
|
|
|
|
__all__ = ["ColoTracer"]
|
|
|
|
|
|
@enum.unique
class TracerType(enum.Enum):
    """Tracing modes a tracer can be configured with."""

    # plain torch.fx tracing, no meta data is attached to the proxies
    DEFAULT = 1
    # tracing with meta data computed for every created proxy
    META = 2
|
|
|
|
|
|
class ColoTracer(Tracer):
|
|
"""
|
|
ColoTracer is a symbolic tracer designed to support dynamic control flow by using meta tensors for the `colossalai.fx` module.
|
|
This tracer is initialized in the same way as the original torch.fx.Tracer.
|
|
|
|
Usage::
|
|
|
|
class Model(nn.Module):
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.linear1 = nn.Linear(10, 10)
|
|
self.linear2 = nn.Linear(10, 10)
|
|
|
|
def forward(self, x, y):
|
|
x1 = self.linear1(x)
|
|
y1 = self.linear2(y)
|
|
|
|
if x1.dim() == 2:
|
|
return x1 + y1
|
|
else:
|
|
return x1 - y1
|
|
|
|
model = Model()
|
|
tracer = ColoTracer()
|
|
graph = tracer.trace(model, concrete_args={'y': torch.rand(4, 10)}, meta_args={'x': torch.rand(4, 10, device='meta')})
|
|
"""
|
|
|
|
def __init__(self, trace_act_ckpt: bool = False, *args, **kwargs):
    """
    Set up the tracer state.

    Args:
        trace_act_ckpt (bool): whether the tracer will record the usage of torch.utils.checkpoint.
        *args, **kwargs: forwarded unchanged to the parent tracer's constructor.
    """
    super().__init__(*args, **kwargs)

    # meta tracing with ColoProxy is the initial configuration
    self.proxy_cls = ColoProxy
    self.tracer_type = TracerType.META

    # activation checkpoint bookkeeping:
    #   trace_act_ckpt               -> record the usage of torch.utils.checkpoint or not
    #   inside_torch_checkpoint_func -> True while tracing runs inside a checkpointed function
    #   act_ckpt_region_count        -> index used to label nodes of a checkpoint region
    self.trace_act_ckpt = trace_act_ckpt
    self.act_ckpt_region_count = 0
    self.inside_torch_checkpoint_func = False
|
|
|
|
# Feature flag: when True, accesses to buffer values are proxied as well
proxy_buffer_attributes: bool = True

# names of the `torch` functions that get wrapped while tracing with meta arguments
_TORCH_METHODS_TO_PATCH = [
    "arange",
    "zeros",
    "ones",
    "full",
    "full_like",
    "eye",
    "empty",
    "tensor",
    "finfo",
]
|
|
|
|
def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, proxy_factory_fn=None) -> ColoProxy:
    """
    Create a proxy for one traced operation.

    With ``TracerType.DEFAULT`` the call is forwarded unchanged to the parent tracer.
    Otherwise the bias-addition registries are consulted first: if a handler is
    registered for the operation, the result of the handler's ``generate()`` is
    returned. When no handler applies, the node is created by the parent tracer
    and the output of ``_meta_data_computing`` is stored in ``proxy.meta_data``.

    Raises:
        AttributeError: for a ``call_module`` operation when the tracer has no
            ``orig_forward`` attribute (it is assigned in ``call_module``).
    """
    if self.tracer_type == TracerType.DEFAULT:
        # meta_args was not given, so this behaves exactly like torch.fx.Tracer
        return super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)

    # If the graph is traced for the auto parallelism module, extra nodes are added
    # during graph construction to keep bias addition compatible with all reduce.
    # `bias_handle` stays None when no such manipulation applies.
    args_metas, _ = extract_meta(*args, **kwargs)
    bias_handle = None

    if kind == "call_function":
        if bias_addition_function.has(target):
            # F.linear is only rewritten when a non-None bias is passed by keyword
            linear_without_bias = target == torch.nn.functional.linear and not (
                "bias" in kwargs and kwargs["bias"] is not None
            )
            if not linear_without_bias:
                substitute = func_to_func_dict[target]
                bias_handle = bias_addition_function.get(target)(self, target, args, kwargs, substitute)
        elif bias_addition_function.has(target.__name__):
            # use name for some builtin op like @ (matmul)
            substitute = func_to_func_dict[target]
            bias_handle = bias_addition_function.get(target.__name__)(self, target, args, kwargs, substitute)

    elif kind == "call_method":
        # resolve the method name against the class of the first meta argument
        resolved_method = getattr(args_metas[0].__class__, target)
        if bias_addition_method.has(resolved_method):
            substitute = method_to_func_dict[resolved_method]
            bias_handle = bias_addition_method.get(resolved_method)(self, target, args, kwargs, substitute)

    elif kind == "call_module":
        if not hasattr(self, "orig_forward"):
            raise AttributeError(f"{self} does not have an attribute called orig_forward")
        # while the flag is set, `_module_getattr` hands back raw attribute values
        self._disable_module_getattr = True
        try:
            submodule = self.root.get_submodule(target)
            submodule_type = type(submodule)
            if bias_addition_module.has(submodule_type) and submodule.bias is not None:
                substitute = module_to_func_dict[submodule_type]
                bias_handle = bias_addition_module.get(submodule_type)(self, target, args, kwargs, substitute)
        finally:
            self._disable_module_getattr = False

    if bias_handle is not None:
        return bias_handle.generate()

    # no extra manipulation: create the node from the original arguments
    proxy = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
    proxy.meta_data = self._meta_data_computing(kind, target, args, kwargs)
    return proxy
|
|
|
|
def _module_getattr(self, attr, attr_val, parameter_proxy_cache):
    """
    Return a ``get_attr`` proxy for a parameter or buffer of ``self.root``, or the raw value.

    Args:
        attr: name of the accessed attribute (not used by this implementation).
        attr_val: the value obtained from the attribute access.
        parameter_proxy_cache: dict mapping qualified names to already created proxies;
            new proxies are stored in it.

    Returns:
        ``attr_val`` itself when ``_disable_module_getattr`` is set or when the value is
        not one of the root's parameters/buffers, otherwise the (cached) proxy.
    """
    if getattr(self, "_disable_module_getattr", False):
        return attr_val

    def lookup_proxy(value, named_members, cache):
        # search by identity, then create the proxy once and reuse it afterwards
        for qualified_name, member in named_members:
            if value is not member:
                continue
            if qualified_name not in cache:
                extra = {}
                if "proxy_factory_fn" in inspect.signature(self.create_proxy).parameters:
                    if self.param_shapes_constant:
                        extra["proxy_factory_fn"] = lambda node: ParameterProxy(self, node, qualified_name, value)
                    else:
                        extra["proxy_factory_fn"] = None
                cache[qualified_name] = self.create_proxy("get_attr", qualified_name, (), {}, **extra)  # type: ignore[arg-type]
            return cache[qualified_name]
        return None

    if isinstance(attr_val, torch.nn.Parameter):
        found = lookup_proxy(attr_val, self.root.named_parameters(), parameter_proxy_cache)
        if found is not None:
            return found

    # buffers are only proxied when the feature flag is on
    if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
        found = lookup_proxy(attr_val, self.root.named_buffers(), parameter_proxy_cache)
        if found is not None:
            return found

    return attr_val
|
|
|
|
def call_module(self, m, forward, args, kwargs):
    """
    Trace a module call, either as a single ``call_module`` node or by tracing through it.

    The module's ``forward`` is remembered on ``self.orig_forward``; ``create_proxy``
    and ``_meta_data_computing`` require that attribute for ``call_module`` nodes.
    """
    self.orig_forward = forward
    qualified_name = self.path_of_module(m)

    # Customized modules are not leaf modules by default. A customized or third-party
    # module (e.g. apex.normalization.FusedRMSNorm) that has a meta patch registered
    # is treated as a leaf module as well.
    treat_as_leaf = meta_patched_module.has(m.__class__) or self.is_leaf_module(m, qualified_name)
    if not treat_as_leaf:
        return forward(*args, **kwargs)
    return self.create_proxy("call_module", qualified_name, args, kwargs)
|
|
|
|
def proxy(self, node) -> Proxy:
    """
    Wrap ``node`` with the proxy class currently configured on the tracer
    (``self.proxy_cls``), passing the tracer itself as second argument.
    """
    proxy_type = self.proxy_cls
    wrapped = proxy_type(node, self)
    return wrapped
|
|
|
|
def _configure_tracer_type(self, tracer_type: TracerType):
    """
    Switch the tracer mode and the proxy class that goes with it.

    ``TracerType.DEFAULT`` selects ``Proxy``, ``TracerType.META`` selects ``ColoProxy``.

    Raises:
        ValueError: if ``tracer_type`` is neither of the two modes.
    """
    is_default = tracer_type == TracerType.DEFAULT
    if not is_default and not tracer_type == TracerType.META:
        raise ValueError(f"Unrecognized tracer type {tracer_type}")

    if is_default:
        self.proxy_cls, self.tracer_type = Proxy, TracerType.DEFAULT
    else:
        self.proxy_cls, self.tracer_type = ColoProxy, TracerType.META
|
|
|
|
def _meta_data_computing(self, kind, target, args, kwargs):
    """
    Compute the meta data attached to the proxy of a node.

    Args:
        kind: the node op (``placeholder``, ``call_function``, ``call_method``,
            ``call_module`` or ``get_attr``).
        target: the node target; a callable for ``call_function``, a string otherwise.
        args: positional arguments of the node, passed through ``extract_meta``.
        kwargs: keyword arguments of the node, passed through ``extract_meta``.
            ``kwargs["device"]`` is overwritten with ``"meta"`` for patched torch constructors.

    Returns:
        The meta output of the operation, or ``None`` for node kinds not handled here.

    Raises:
        RuntimeError: if computing the meta data fails; the original error is chained.
    """
    if kind == "placeholder" and target in self.meta_args and self.meta_args[target].is_meta:
        # the user supplied meta tensor is the meta data of the placeholder
        return self.meta_args[target]

    if target in self.orig_torch_tensor_methods:
        # NOTE: tensor constructors in PyTorch define the `device` argument as
        # *kwargs-only*. That is why this works. If you add methods to
        # _TORCH_METHODS_TO_PATCH that do not define `device` as kwarg-only,
        # this will break and you will likely see issues where we cannot infer
        # the size of the output.
        if "device" in kwargs:
            kwargs["device"] = "meta"

    try:
        args_metas, kwargs_metas = extract_meta(*args, **kwargs)

        if kind == "call_function":
            # Our meta data will not record the nn.parameter.Parameter attribute.
            # It works fine in most of the cases, but it may cause some problems after
            # the bias addition manipulation.
            # Therefore, the nn.parameter.Parameter attribute is recorded for the operations
            # added by the bias addition manipulation following the get_attr node.
            convert_to_parameter = False
            if target in (torch.transpose, torch.reshape) and isinstance(
                args_metas[0], torch.nn.parameter.Parameter
            ):
                convert_to_parameter = True

            # fetch patched function
            if meta_patched_function.has(target):
                meta_target = meta_patched_function.get(target)
            elif meta_patched_function.has(target.__name__):
                # use name for some builtin op like @ (matmul)
                meta_target = meta_patched_function.get(target.__name__)
            else:
                meta_target = target

            meta_out = meta_target(*args_metas, **kwargs_metas)
            if isinstance(meta_out, torch.Tensor):
                meta_out = meta_out.to(device="meta")
            if convert_to_parameter:
                meta_out = torch.nn.Parameter(meta_out)

        elif kind == "call_method":
            # For call_method nodes `target` is the method *name* (a string), so it must be
            # resolved to the actual method before it can be compared with torch.Tensor.view.
            method = getattr(args_metas[0].__class__, target)

            # Same reasoning as for call_function: keep the nn.parameter.Parameter attribute
            # for the view inserted by the bias addition manipulation after a get_attr node.
            convert_to_parameter = method in (torch.Tensor.view,) and isinstance(
                args_metas[0], torch.nn.parameter.Parameter
            )

            # fetch patched method
            if meta_patched_function.has(method):
                meta_target = meta_patched_function.get(method)
            else:
                meta_target = method

            meta_out = meta_target(*args_metas, **kwargs_metas)
            if convert_to_parameter:
                meta_out = torch.nn.Parameter(meta_out)

        elif kind == "call_module":
            if not hasattr(self, "orig_forward"):
                raise AttributeError(f"{self} does not have an attribute called orig_forward")
            # make `_module_getattr` return raw attribute values while the module runs on meta data
            self._disable_module_getattr = True
            try:
                mod = self.root.get_submodule(target)
                mod_type = type(mod)
                if meta_patched_module.has(mod_type):
                    meta_out = meta_patched_module.get(mod_type)(mod, *args_metas, **kwargs_metas)
                else:
                    meta_out = self.orig_forward(*args_metas, **kwargs_metas)
            finally:
                self._disable_module_getattr = False

        elif kind == "get_attr":
            self._disable_module_getattr = True
            try:
                # walk the dotted path starting from the root module
                attr_itr = self.root
                for atom in target.split("."):
                    attr_itr = getattr(attr_itr, atom)
                if isinstance(attr_itr, torch.nn.parameter.Parameter):
                    meta_out = torch.nn.Parameter(attr_itr.to(device="meta"))
                elif isinstance(attr_itr, torch.Tensor):
                    meta_out = attr_itr.to(device="meta")
                else:
                    meta_out = attr_itr
            finally:
                self._disable_module_getattr = False

        else:
            return None

    except Exception as e:
        raise RuntimeError(f"Could not compute metadata for {kind} target {target}: {e}") from e

    return meta_out
|
|
|
|
def trace(
    self,
    root: nn.Module,
    concrete_args: Optional[Dict[str, Tensor]] = None,
    meta_args: Optional[Dict[str, Tensor]] = None,
) -> Graph:
    """
    Trace the forward computation graph using `torch.fx.Tracer`. This tracer enables data-dependent control flow.

    Args:
        root (nn.Module): a `nn.Module` object to trace the computation graph
        meta_args (Optional[Dict[str, Tensor]]): the meta tensor arguments used to trace the computation graph.
            These arguments are the sample data fed to the model during actual computation, but just converted to meta tensors.
        concrete_args (Optional[Dict[str, Tensor]]): the concrete arguments that should not be treated as Proxies.
            The dict passed by the caller is not modified.

    Returns:
        Graph: the traced graph, also stored on ``self.graph``.

    Raises:
        KeyError: if a key of ``meta_args`` or ``concrete_args`` is not a parameter of ``root.forward``.
        AssertionError: if a concrete argument is a meta tensor or a meta argument is not.
    """
    if meta_args is None:
        meta_args = {}

    # Work on a private copy: default values of the forward signature are merged
    # into this dict below and must not leak into the dict owned by the caller.
    concrete_args = {} if concrete_args is None else dict(concrete_args)

    if len(meta_args) == 0:
        self._configure_tracer_type(TracerType.DEFAULT)
    else:
        self._configure_tracer_type(TracerType.META)

    # check concrete and meta args have valid names
    sig = inspect.signature(root.forward)
    sig_names = set(sig.parameters.keys())
    meta_arg_names = set(meta_args.keys())

    # update concrete args with default values
    non_meta_arg_names = sig_names - meta_arg_names
    for k, v in sig.parameters.items():
        if k in non_meta_arg_names and k not in concrete_args and v.default is not inspect.Parameter.empty:
            concrete_args[k] = v.default

    # get non concrete arg names
    concrete_arg_names = set(concrete_args.keys())
    non_concrete_arg_names = sig_names - concrete_arg_names

    def _check_arg_name_valid(names):
        success, element = is_element_in_list(names, sig_names)
        if not success:
            raise KeyError(
                f"argument {element} is not found in the signature of {root.__class__.__name__}'s forward function"
            )

    _check_arg_name_valid(meta_arg_names)
    _check_arg_name_valid(concrete_arg_names)

    def _check_kwargs(kwargs, should_be_meta: bool):
        for k, v in kwargs.items():
            if not should_be_meta:
                assert (
                    not torch.is_tensor(v) or not v.is_meta
                ), f"Expected the {k} not to be a meta tensor, please check the args passed to the tracer"
            else:
                assert (
                    v.is_meta == should_be_meta
                ), f"Expected the is_meta attribute of {k} to be {should_be_meta}, but got {v.is_meta}, please check the args passed to the tracer"

    _check_kwargs(concrete_args, should_be_meta=False)
    _check_kwargs(meta_args, should_be_meta=True)

    # assign as attributes for later reference
    self.concrete_args = concrete_args
    self.meta_args = meta_args

    self.patched_torch_tensor_methods = {}
    if self.tracer_type == TracerType.META:
        # wrap the torch tensor constructing methods so that they are captured in the graph
        self.patched_torch_tensor_methods = {
            target: wrap_tensor_constructor_method(getattr(torch, target))
            for target in self._TORCH_METHODS_TO_PATCH
        }

        # cache these methods so that we can detect whether a method call
        # should be patched during tracing
        self.orig_torch_tensor_methods = [val[1] for val in self.patched_torch_tensor_methods.values()]

    try:
        # patch these methods to replace their original use; done inside the `try`
        # so that the `finally` clause restores them even if patching stops half-way
        for name, (wrapper, _) in self.patched_torch_tensor_methods.items():
            setattr(torch, name, wrapper)

        # to track the usage of torch.utils.checkpoint
        with self.trace_activation_checkpoint(enabled=self.trace_act_ckpt):
            self.graph = super().trace(root, concrete_args=concrete_args)

    finally:
        # recover the patched methods
        for name, (_, orig) in self.patched_torch_tensor_methods.items():
            setattr(torch, name, orig)

    if self.tracer_type == TracerType.DEFAULT:
        return self.graph

    # This is necessary because concrete args are added as input to the traced module since
    # https://github.com/pytorch/pytorch/pull/55888.
    for node in self.graph.nodes:
        if node.op == "placeholder":
            # Removing default values for inputs as the forward pass will fail with them.
            if node.target in non_concrete_arg_names:
                node.args = ()
                # Without this, torch.jit.script fails because the inputs type is Optional[torch.Tensor].
                # It cannot infer on the attributes and methods the input should have, and fails.
                node.type = torch.Tensor
            # It is a concrete arg so it is not used and should be removed.
            else:
                if hasattr(torch.fx._symbolic_trace, "_assert_is_none"):
                    # Newer versions of torch.fx emit an assert statement
                    # for concrete arguments; delete those before we delete
                    # the concrete arg.
                    to_delete = [
                        user for user in node.users if user.target == torch.fx._symbolic_trace._assert_is_none
                    ]
                    for user in to_delete:
                        self.graph.erase_node(user)

                self.graph.erase_node(node)

        # TODO: solves GraphModule creation.
        # Without this, return type annotation "Tuple" is causing code execution failure.
        if node.op == "output":
            node.type = None

    return self.graph
|
|
|
|
@contextmanager
def trace_activation_checkpoint(self, enabled: bool):
    """
    Context manager that replaces ``torch.utils.checkpoint.CheckpointFunction`` while tracing.

    The replacement runs the checkpointed function directly, flags the tracer as being
    inside a checkpoint region for the duration of the call and increments
    ``act_ckpt_region_count`` afterwards.

    Args:
        enabled (bool): when False nothing is patched and the context is a no-op.

    The original ``CheckpointFunction`` is restored on exit, including when the body of
    the ``with`` statement raises.
    """
    if not enabled:
        yield
        return

    orig_ckpt_func = torch.utils.checkpoint.CheckpointFunction

    class PatchedCheckpointFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, run_function, preserve_rng_state, *args):
            # signal that the current tracing occurs within activation checkpoint part
            self.inside_torch_checkpoint_func = True
            try:
                out = run_function(*args)
            finally:
                # never leave the flag set, otherwise later nodes would be mislabelled
                self.inside_torch_checkpoint_func = False
            self.act_ckpt_region_count += 1
            return out

        @staticmethod
        def backward(ctx: Any, *grad_outputs: Any) -> Any:
            raise NotImplementedError(
                "We do not implement the backward pass as we only trace the forward pass."
            )

    # override the checkpoint function
    torch.utils.checkpoint.CheckpointFunction = PatchedCheckpointFunction
    try:
        yield
    finally:
        # recover the checkpoint function upon exit, even if tracing failed
        torch.utils.checkpoint.CheckpointFunction = orig_ckpt_func
|
|
|
|
def create_node(self, *args, **kwargs) -> Node:
    """
    Create a node through the parent tracer and, while tracing runs inside an
    activation checkpoint function, record the current region index in
    ``node.meta["activation_checkpoint"]``.
    """
    created = super().create_node(*args, **kwargs)
    if not self.inside_torch_checkpoint_func:
        return created

    # annotate the activation checkpoint module
    created.meta["activation_checkpoint"] = self.act_ckpt_region_count
    return created
|
|
|
|
|
|
def wrap_tensor_constructor_method(target):
    """
    Wrap a tensor constructor so that calls involving a proxy are recorded in the graph.

    Args:
        target: the original callable (e.g. ``torch.ones``).

    Returns:
        A ``(wrapper, target)`` tuple: the wrapping function and the untouched original.
    """

    def look_for_proxy(*args, **kwargs):
        # Search positional values first, then keyword values; tuples and lists are
        # searched recursively. A container without a proxy must not end the search,
        # otherwise e.g. ``torch.full((2, 3), proxy)`` would be missed.
        for value in (*args, *kwargs.values()):
            if isinstance(value, Proxy):
                return value
            if isinstance(value, (tuple, list)):
                found = look_for_proxy(*value)
                if found is not None:
                    return found
        return None

    @functools.wraps(target)
    def wrapper(*args, **kwargs):
        proxy = look_for_proxy(*args, **kwargs)

        if proxy is None:
            # this is called directly when the inputs do not contain proxy
            # e.g. torch.ones(4) where the input is static
            return target(*args, **kwargs)

        # if the arg is a proxy, then need to record this function called on this proxy
        # e.g. torch.ones(size) where size is an input proxy
        colo_proxy = proxy.tracer.create_proxy("call_function", target, args, kwargs)
        if not isinstance(colo_proxy, ColoProxy):
            meta_out = compute_meta_data_for_functions_proxy(target, args, kwargs)
            # wrap the node that was just created, not the node of the input proxy
            colo_proxy = ColoProxy(colo_proxy.node)
            colo_proxy.meta_data = meta_out
        return colo_proxy

    return wrapper, target
|
|
|
|
|
|
# Install the magic methods (e.g. __sub__) on ColoProxy so that the tracer records them
# as `call_function` nodes and the created proxy carries a meta_data attribute.
def _scope(method):
    def impl(*args, **kwargs):
        tracer = args[0].tracer
        target = getattr(operator, method)
        proxy = tracer.create_proxy("call_function", target, args, kwargs)
        if isinstance(proxy, ColoProxy):
            return proxy
        # the tracer returned another proxy type: compute the meta data and re-wrap the node
        meta_out = compute_meta_data_for_functions_proxy(target, args, kwargs)
        proxy = ColoProxy(proxy.node)
        proxy.meta_data = meta_out
        return proxy

    impl.__name__ = method
    as_magic = f'__{method.strip("_")}__'
    setattr(ColoProxy, as_magic, impl)


for method in magic_methods:
    _scope(method)
|
|
|
|
|
|
def _define_reflectable(orig_method_name):
    """
    Install the reflected variant (``__r<name>__``) of a magic method on ``ColoProxy``.

    The installed method records ``operator.<orig_method_name>(rhs, self)`` as a
    ``call_function`` node and makes sure the returned proxy is a ``ColoProxy``
    with ``meta_data`` set.
    """
    method_name = f'__r{orig_method_name.strip("_")}__'

    def impl(self, rhs):
        target = getattr(operator, orig_method_name)
        # operands are swapped: the proxy is the right-hand side of the operator
        operands = (rhs, self)
        proxy = self.tracer.create_proxy("call_function", target, operands, {})
        if not isinstance(proxy, ColoProxy):
            # pass (target, args, kwargs), like the other call sites of this helper
            meta_out = compute_meta_data_for_functions_proxy(target, operands, {})
            proxy = ColoProxy(proxy.node)
            proxy.meta_data = meta_out
        return proxy

    impl.__name__ = method_name
    impl.__qualname__ = method_name
    setattr(ColoProxy, method_name, impl)


for orig_method_name in reflectable_magic_methods:
    _define_reflectable(orig_method_name)
|