mirror of https://github.com/hpcaitech/ColossalAI
356 lines
12 KiB
Python
356 lines
12 KiB
Python
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
|
|
|
import torch
|
|
import torch.fx
|
|
from torch.autograd.profiler_util import _format_memory
|
|
from torch.fx import GraphModule
|
|
from torch.fx.node import Argument, Node, Target
|
|
|
|
from colossalai._analyzer._subclasses import flop_count
|
|
from colossalai._analyzer.fx.node_util import MetaInfo
|
|
|
|
|
|
def _format_flops(flops: float) -> str:
|
|
"""Returns a formatted FLOP size string"""
|
|
if flops > 1e12:
|
|
return f"{flops / 1e12:.2f} TFLOPs"
|
|
elif flops > 1e9:
|
|
return f"{flops / 1e9:.2f} GFLOPs"
|
|
elif flops > 1e6:
|
|
return f"{flops / 1e6:.2f} MFLOPs"
|
|
elif flops > 1e3:
|
|
return f"{flops / 1e3:.2f} kFLOPs"
|
|
return f"{flops} FLOPs"
|
|
|
|
|
|
def _denormalize_tuple(t: Tuple[int, ...]) -> Tuple[int, ...]:
|
|
return t[0] if len(t) == 1 else t
|
|
|
|
|
|
def _normalize_tuple(x):
|
|
if not isinstance(x, tuple):
|
|
return (x,)
|
|
return x
|
|
|
|
|
|
def _current_device(module):
|
|
return next(module.parameters()).device
|
|
|
|
|
|
class GraphProfiler(torch.fx.Interpreter):
    """
    Base interpreter for profiling passes over a ``GraphModule``.

    Fetch shape argument from ``ShapeProp`` without re-executing
    the ``GraphModule`` from scratch: the outputs already attached to each
    ``Node``'s ``MetaInfo`` are used as the execution environment.
    """

    # Node ops that profiling subclasses dispatch on.
    _profileable = [
        "call_function",
        "call_module",
        "call_method",
    ]

    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
        """
        Args:
            module (GraphModule): The module to be interpreted.
            garbage_collect_values (bool): Forwarded to ``torch.fx.Interpreter``;
                when true, entries of the environment are deleted after their last use.
        """
        super().__init__(module, garbage_collect_values)

    def run(self, *args, initial_env: Optional[Dict[Node, Any]] = None, enable_io_processing: bool = True) -> Any:
        """
        Run `module` via interpretation and return the result.

        Unlike the stock interpreter loop, the value returned by ``run_node`` is
        not written back into the environment; the result of the ``output`` node is
        read from the environment as populated through ``initial_env``.

        Args:
            *args: The arguments to the Module to run, in positional order
            initial_env (Optional[Dict[Node, Any]]): An optional starting environment for execution.
                This is a dict mapping `Node` to any value. This can be used, for example, to
                pre-populate results for certain `Nodes` so as to do only partial evaluation within
                the interpreter.
            enable_io_processing (bool): If true, we process the inputs and outputs with graph's process_inputs and
                process_outputs function first before using them.

        Returns:
            Any: The value returned from executing the Module
        """
        # Compare against ``None`` explicitly so that a caller-provided (possibly
        # empty) mapping is used as-is.
        self.env = initial_env if initial_env is not None else {}

        # Positional function args are consumed left-to-right by
        # `placeholder` nodes. Use an iterator to keep track of
        # position and extract those values.
        if enable_io_processing:
            args = self.module.graph.process_inputs(*args)
        self.args_iter: Iterator[Any] = iter(args)

        for node in self.module.graph.nodes:
            self.run_node(node)  # No need to store.

            if self.garbage_collect_values:
                # Drop values whose last user is the node that has just run.
                for to_delete in self.user_to_last_uses.get(node, []):
                    del self.env[to_delete]

            if node.op == "output":
                output_val = self.env[node]
                return self.module.graph.process_outputs(output_val) if enable_io_processing else output_val

    def fetch_initial_env(self, device=None) -> Dict[Node, Any]:
        """
        Fetch ``initial_env`` for execution. This is because ``ShapeProp``
        has already attached outputs of each ``Node`` to its ``MetaInfo``.

        Args:
            device (torch.device): The device to place the execution, default to ``None``.
                Currently unused by this implementation.

        Returns:
            Dict[Node, Any]: The initial environment for execution, mapping every node of
            the graph to its recorded outputs (one-element tuples are unwrapped).
        """
        initial_env = {}
        for n in self.module.graph.nodes:
            initial_env[n] = _denormalize_tuple(MetaInfo(n).outputs)
        return initial_env

    def propagate(self, *args, device=None):
        """
        Run `module` via interpretation and profile the execution
        of each ``Node``.

        Args:
            *args (Tensor): The sample input, not used
            device (torch.device): The device to place the execution, default to ``None``.
                Forwarded to ``fetch_initial_env``.

        Returns:
            Any: The value returned from executing the Module
        """
        initial_env = self.fetch_initial_env(device)

        # ``args`` is intentionally not forwarded: every value comes from ``initial_env``.
        return self.run(initial_env=initial_env)

    def summary(self) -> str:
        """
        Summarizes the profiled statistics of the `GraphModule` in
        tabular format. Note that this API requires the ``tabulate`` module
        to be installed.

        Returns:
            str: The summary of the profiled statistics

        Raises:
            ImportError: If ``tabulate`` is not installed.
        """
        # https://github.com/pytorch/pytorch/blob/master/torch/fx/graph.py
        try:
            from tabulate import tabulate
        except ImportError as err:
            # Fail immediately with an actionable message instead of printing a hint
            # and crashing later on the unbound ``tabulate`` name.
            raise ImportError(
                "`summary` relies on the library `tabulate`, "
                "which could not be found on this machine. Run `pip "
                "install tabulate` to install the library."
            ) from err

        # Build up a list of summary information for each node
        node_summaries: List[List[Any]] = []
        last_n_info = None

        for node in self.module.graph.nodes:
            node: Node
            n_info = MetaInfo(node)
            # For the first node there is no predecessor, so compare it with itself
            # (incremental size of zero).
            last_n_info = last_n_info or n_info
            node_summaries.append(
                [
                    node.op,
                    str(node),
                    _format_memory(n_info.accumulate_size),
                    _format_memory(n_info.accumulate_size - last_n_info.accumulate_size),
                    _format_memory(n_info.output_size),
                    _format_memory(n_info.temp_size),
                    _format_memory(n_info.param_size),
                    _format_memory(n_info.backward_size),
                    _format_flops(n_info.fwd_flop),
                    _format_flops(n_info.bwd_flop),
                ]
            )
            last_n_info = n_info

        # Use the ``tabulate`` library to create a well-formatted table
        # presenting our summary information
        headers: List[str] = [
            "Op type",
            "Op",
            "Accumulate size",
            "Incremental size",
            "Output size",
            "Temp size",
            "Param size",
            "Backward size",
            "Fwd FLOPs",
            "Bwd FLOPs",
        ]

        return tabulate(node_summaries, headers=headers, stralign="right")
|
|
|
|
|
|
class CommunicationProfiler(GraphProfiler):
    """
    Placeholder for a profiler of communication nodes.

    TODO(lyl): Add this for all comm nodes
    """

    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
        # Not implemented yet: constructing this profiler always fails.
        raise NotImplementedError
|
|
|
|
|
|
class FlopProfiler(GraphProfiler):
    """
    Walk an FX graph Node-by-Node and store the forward/backward FLOP count
    of each profileable node into the node's ``MetaInfo``.

    Usage:
        >>> model = MyModule()
        >>> x = torch.rand(10, 10)
        >>> gm = colossalai.fx.symbolic_trace(model, meta_args = {'x': x})
        >>> shape_interp = ShapeProp(gm)    # must do this first
        >>> shape_interp.propagate(x)
        >>> profiler = FlopProfiler(gm)
        >>> profiler.propagate(x)

    Args:
        module (GraphModule): The module to be executed

    Hints:
        If you want to add a new flop count rule, you can first
        check the existing files in ``../_subclasses/flop_tensor.py``.
        If your flop count rules are incompatible with the existing
        ones, you can do so by adding a new method to this class
        with the ``@register_flop_count_impl`` decorator. The method
        should take (*args, **kwargs) instance as its input and
        generate flop count for both forward and backward as its
        output.

        For example, if you want to add a flop count rule for
        ``my_fn``, which is a hand-written operand not detected by
        PyTorch, you can do so by adding a new method to this
        class with the ``@register_flop_count_impl`` decorator:

        >>> @register_flop_count_impl(my_fn)
        >>> def my_fn_flop_count_impl(*args, **kwargs):
        >>>     return 0, 0
    """

    # Maps a ``call_function`` target to a user-defined flop counting callable.
    _custom_flop_count_impl = {}

    def run_node(self, n: torch.fx.Node) -> Any:
        """
        Profile the FLOPs of a single node ``n`` and record them in its ``MetaInfo``.
        Only call_function, call_method, and call_module nodes are profiled;
        other nodes are passed through untouched.

        Args:
            n (Node): The Node to profile

        Returns:
            Any: The output of the node as recorded in its ``MetaInfo``

        Raises:
            RuntimeError: If profiling the node fails.
        """
        args, kwargs = self.fetch_args_kwargs_from_env(n)
        n_info = MetaInfo(n)

        if n.op in self._profileable:
            try:
                # Dispatch to ``call_function`` / ``call_method`` / ``call_module``.
                profile_fn = getattr(self, n.op)
                fwd_flop, bwd_flop = profile_fn(n.target, args, kwargs)
                n_info.fwd_flop = fwd_flop
                n_info.bwd_flop = bwd_flop
            except Exception as e:
                raise RuntimeError(
                    f"Error {str(e)} occurred when profiling node {n}, node.target = {n.target}. "
                    f"Please refer to function's docstring to register the relevant profile_impl for this node!"
                ) from e

        # retain the autograd graph
        for param in self.module.parameters():
            param.grad = None

        recorded_outputs = n_info.outputs
        return _denormalize_tuple(recorded_outputs)

    def call_function(self, target: "Target", args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Profile a ``call_function`` node and return its flop count.
        A target registered in ``_custom_flop_count_impl`` is counted by the
        registered implementation; every other target goes through ``flop_count``.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Return
            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
        """
        assert not isinstance(target, str)

        # Default path: no user-defined rule for this target.
        if target not in self._custom_flop_count_impl:
            return flop_count(target, *args, **kwargs)

        custom_impl = self._custom_flop_count_impl[target]
        return custom_impl(*args, **kwargs)

    def call_method(self, target: "Target", args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Profile a ``call_method`` node and return its flop count.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Return
            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
        """
        assert isinstance(target, str)
        # The method name is resolved on ``torch.Tensor`` before counting.
        tensor_method = getattr(torch.Tensor, target)
        return flop_count(tensor_method, *args, **kwargs)

    def call_module(self, target: "Target", args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Profile a ``call_module`` node and return its flop count.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Return
            flop_count (Tuple[int]): (fwd_flop, bwd_flop)
        """
        assert isinstance(target, str)
        # Look up the submodule named by ``target`` and count its flops.
        return flop_count(self.fetch_attr(target), *args, **kwargs)
|
|
|
|
|
|
def graph_profile_pass(module: GraphModule, *args, verbose=False) -> GraphModule:
    """
    Profile every ``Node`` of ``module`` by interpreting it with each
    registered profiler.

    Args:
        module (GraphModule): The GraphModule to profile
        *args (Any): The sample input, not used
        verbose (bool): Whether to print the profiling summary

    Returns:
        GraphModule: The same GraphModule with profiling information
    """
    profiler_classes = (
        FlopProfiler,
        # CommunicationProfiler,    # TODO: add communication profiling
    )
    for profiler_cls in profiler_classes:
        profiler = profiler_cls(module)
        device = _current_device(module)
        profiler.propagate(*args, device=device)

    # The summary is taken from the last profiler that ran.
    if verbose:
        print(profiler.summary())
    return module
|