from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import torch
import torch.fx
from torch.autograd.profiler_util import _format_memory, _format_time
from torch.fx import GraphModule
from torch.fx.node import Argument, Node, Target

from colossalai._analyzer._subclasses import flop_count
from colossalai._analyzer.fx.node_util import MetaInfo


def _format_flops(flops: float) -> str:
    """Returns a formatted FLOP size string"""
    if flops > 1e12:
        return f'{flops / 1e12:.2f} TFLOPs'
    elif flops > 1e9:
        return f'{flops / 1e9:.2f} GFLOPs'
    elif flops > 1e6:
        return f'{flops / 1e6:.2f} MFLOPs'
    elif flops > 1e3:
        return f'{flops / 1e3:.2f} kFLOPs'
    return f'{flops} FLOPs'
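
# Illustrative behavior of ``_format_flops`` (doctest-style comments,
# not executed at import time):
#
# >>> _format_flops(2.5e9)
# '2.50 GFLOPs'
# >>> _format_flops(512.0)
# '512.0 FLOPs'
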
def _denormalize_tuple(t: Tuple[Any, ...]) -> Any:
    """Unwrap a 1-element tuple; longer tuples are returned unchanged."""
    return t[0] if len(t) == 1 else t


def _normalize_tuple(x):
    """Wrap a non-tuple value into a 1-element tuple."""
    if not isinstance(x, tuple):
        return (x,)
    return x


def _current_device(module):
    """Return the device of the module's first parameter."""
    return next(module.parameters()).device
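
# ``_normalize_tuple`` and ``_denormalize_tuple`` are inverses on single values
# and 1-tuples; ``MetaInfo`` stores node outputs in the normalized (tuple) form:
#
# >>> _normalize_tuple(3)         # -> (3,)
# >>> _denormalize_tuple((3,))    # -> 3
# >>> _denormalize_tuple((1, 2))  # -> (1, 2)
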
class GraphProfiler(torch.fx.Interpreter):
    """
    Fetch the shape arguments cached by ``ShapeProp`` without re-executing
    the ``GraphModule`` from scratch.
    """
    _profileable = [
        'call_function',
        'call_module',
        'call_method',
    ]

    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
        super().__init__(module, garbage_collect_values)

    def run(self, *args, initial_env: Optional[Dict[Node, Any]] = None, enable_io_processing: bool = True) -> Any:
        """
        Run ``module`` via interpretation and return the result.

        Args:
            *args: The arguments to the Module to run, in positional order
            initial_env (Optional[Dict[Node, Any]]): An optional starting environment for execution.
                This is a dict mapping ``Node`` to any value. This can be used, for example, to
                pre-populate results for certain ``Nodes`` so as to do only partial evaluation within
                the interpreter.
            enable_io_processing (bool): If true, we process the inputs and outputs with the graph's
                process_inputs and process_outputs functions first before using them.

        Returns:
            Any: The value returned from executing the Module
        """
        self.env = initial_env if initial_env else {}

        # Positional function args are consumed left-to-right by
        # `placeholder` nodes. Use an iterator to keep track of
        # position and extract those values.
        if enable_io_processing:
            args = self.module.graph.process_inputs(*args)
        self.args_iter: Iterator[Any] = iter(args)

        for node in self.module.graph.nodes:
            # The environment is pre-populated from ``MetaInfo``, so the
            # result of ``run_node`` does not need to be stored.
            self.run_node(node)

            if self.garbage_collect_values:
                for to_delete in self.user_to_last_uses.get(node, []):
                    del self.env[to_delete]

            if node.op == 'output':
                output_val = self.env[node]
                return self.module.graph.process_outputs(output_val) if enable_io_processing else output_val

    def fetch_initial_env(self, device=None) -> Dict[Node, Any]:
        """
        Fetch the ``initial_env`` for execution. This works because ``ShapeProp``
        has already attached the outputs of each ``Node`` to its ``MetaInfo``.

        Args:
            device (torch.device): The device to place the execution on,
                defaults to ``None`` (currently unused)

        Returns:
            Dict[Node, Any]: The initial environment for execution
        """
        initial_env = {}
        for n in self.module.graph.nodes:
            initial_env[n] = _denormalize_tuple(MetaInfo(n).outputs)
        return initial_env

    def propagate(self, *args, device=None):
        """
        Run ``module`` via interpretation and profile the execution
        of each ``Node``.

        Args:
            *args (Tensor): The sample input, not used
            device (torch.device): The device to place the execution on,
                defaults to ``None``

        Returns:
            Any: The value returned from executing the Module
        """
        initial_env = self.fetch_initial_env(device)

        return self.run(initial_env=initial_env)

    def summary(self) -> str:
        """
        Summarizes the profiled statistics of the ``GraphModule`` in
        tabular format. Note that this API requires the ``tabulate`` module
        to be installed.

        Returns:
            str: The summary of the profiled statistics
        """
        # https://github.com/pytorch/pytorch/blob/master/torch/fx/graph.py
        try:
            from tabulate import tabulate
        except ImportError:
            print("`summary` relies on the library `tabulate`, "
                  "which could not be found on this machine. Run `pip "
                  "install tabulate` to install the library.")
            # Re-raise so the caller fails here rather than with a
            # NameError on ``tabulate`` below.
            raise

        # Build up a list of summary information for each node
        node_summaries: List[List[Any]] = []
        last_n_info = None

        for node in self.module.graph.nodes:
            node: Node
            n_info = MetaInfo(node)
            last_n_info = last_n_info or n_info
            node_summaries.append([
                node.op,
                str(node),
                _format_memory(n_info.accumulate_size),
                _format_memory(n_info.accumulate_size - last_n_info.accumulate_size),
                _format_memory(n_info.output_size),
                _format_memory(n_info.temp_size),
                _format_memory(n_info.param_size),
                _format_memory(n_info.backward_size),
                _format_flops(n_info.fwd_flop),
                _format_flops(n_info.bwd_flop),
            ])
            last_n_info = n_info

        # Use the ``tabulate`` library to create a well-formatted table
        # presenting our summary information
        headers: List[str] = [
            'Op type',
            'Op',
            'Accumulate size',
            'Incremental size',
            'Output size',
            'Temp size',
            'Param size',
            'Backward size',
            'Fwd FLOPs',
            'Bwd FLOPs',
        ]

        return tabulate(node_summaries, headers=headers, stralign='right')
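
# Note: ``GraphProfiler`` never re-executes tensor ops. ``fetch_initial_env``
# reads the per-node outputs that ``ShapeProp`` cached in ``MetaInfo``, so any
# subclass must run *after* shape propagation. A minimal ordering sketch
# (``gm`` is a traced GraphModule, ``x`` a sample input; see ``FlopProfiler``
# below for the full usage):
#
# >>> ShapeProp(gm).propagate(x)       # 1. cache each node's outputs
# >>> FlopProfiler(gm).propagate(x)    # 2. profile using the cached outputs
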
class CommunicationProfiler(GraphProfiler):
    """
    TODO(lyl): Add this for all comm nodes
    """

    def __init__(self, module: GraphModule, garbage_collect_values: bool = True):
        raise NotImplementedError()


class FlopProfiler(GraphProfiler):
    """
    Execute an FX graph Node-by-Node and record the meta data of the result
    into the corresponding node.

    Usage:
        >>> model = MyModule()
        >>> x = torch.rand(10, 10)
        >>> gm = colossalai.fx.symbolic_trace(model, meta_args={'x': x})
        >>> shape_interp = ShapeProp(gm)    # must do this first
        >>> shape_interp.propagate(x)
        >>> profiler = FlopProfiler(gm)
        >>> profiler.propagate(x)

    Args:
        module (GraphModule): The module to be executed

    Hints:
        If you want to add a new flop count rule, first check the
        existing rules in ``../_subclasses/flop_tensor.py``. If your
        flop count rule is incompatible with the existing ones, you
        can register a new implementation with the
        ``@register_flop_count_impl`` decorator. The implementation
        should take ``(*args, **kwargs)`` as its input and return the
        flop counts for both forward and backward as its output.

        For example, if you want to add a flop count rule for
        ``my_fn``, a hand-written operator not detected by PyTorch,
        you can do so with the ``@register_flop_count_impl``
        decorator:

        >>> @register_flop_count_impl(my_fn)
        >>> def my_fn_flop_count_impl(*args, **kwargs):
        >>>     return 0, 0
    """
    _custom_flop_count_impl = {}

    def run_node(self, n: torch.fx.Node) -> Any:
        """
        Run a specific node ``n`` and record its forward and backward FLOP
        counts. Calls into call_function, call_method, and call_module only.

        Args:
            n (Node): The Node to profile

        Returns:
            Any: The output of the node

        Raises:
            RuntimeError: If the node is not profileable.
        """
        args, kwargs = self.fetch_args_kwargs_from_env(n)
        n_info = MetaInfo(n)

        if n.op in self._profileable:
            try:
                (
                    n_info.fwd_flop,
                    n_info.bwd_flop,
                ) = getattr(self, n.op)(n.target, args, kwargs)
            except Exception as e:
                raise RuntimeError(
                    f'Error {str(e)} occurred when profiling node {n}, node.target = {n.target}. '
                    f'Please refer to the function\'s docstring to register the relevant profile_impl for this node!'
                ) from e

        # Release any gradients accumulated while counting backward FLOPs so
        # that profiling does not leak memory across nodes.
        for param in self.module.parameters():
            param.grad = None

        return _denormalize_tuple(n_info.outputs)

    def call_function(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Execute a ``call_function`` node and return the profiling result.
        Dispatches to ``_custom_flop_count_impl`` if the ``call_function``
        should be profiled with user-defined behavior.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Returns:
            flop_count (Tuple[int, int]): (fwd_flop, bwd_flop)
        """
        assert not isinstance(target, str)

        # Dispatch the impl for profiling; the default is ``flop_count``
        if target in self._custom_flop_count_impl:
            return self._custom_flop_count_impl[target](*args, **kwargs)
        else:
            return flop_count(target, *args, **kwargs)
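
    # Besides the ``@register_flop_count_impl`` decorator described in the
    # class docstring, a rule can also be attached directly through the
    # ``_custom_flop_count_impl`` dict that ``call_function`` consults. A
    # minimal sketch, where ``my_fn`` is a hypothetical traced-through
    # callable and the (0, 0) counts are placeholders:
    #
    # >>> FlopProfiler._custom_flop_count_impl[my_fn] = lambda *args, **kwargs: (0, 0)
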
    def call_method(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Execute a ``call_method`` node and return the profiling result.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Returns:
            flop_count (Tuple[int, int]): (fwd_flop, bwd_flop)
        """
        # Resolve the method on ``torch.Tensor`` (e.g. ``x.matmul(y)`` maps to
        # ``torch.Tensor.matmul``) and count its FLOPs
        assert isinstance(target, str)
        return flop_count(getattr(torch.Tensor, target), *args, **kwargs)

    def call_module(self, target: 'Target', args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
        """
        Execute a ``call_module`` node and return the profiling result.

        Args:
            target (Target): The call target for this node. See
                `Node <https://pytorch.org/docs/master/fx.html#torch.fx.Node>`__ for
                details on semantics
            args (Tuple): Tuple of positional args for this invocation
            kwargs (Dict): Dict of keyword arguments for this invocation

        Returns:
            flop_count (Tuple[int, int]): (fwd_flop, bwd_flop)
        """
        # Look up the submodule by its qualified name and count its FLOPs
        assert isinstance(target, str)
        submod = self.fetch_attr(target)
        return flop_count(submod, *args, **kwargs)


def graph_profile_pass(module: GraphModule, *args, verbose=False) -> GraphModule:
    """
    Run ``module`` via interpretation and profile the execution
    of each ``Node``.

    Args:
        module (GraphModule): The GraphModule to profile
        *args (Any): The sample input, not used
        verbose (bool): Whether to print the profiling summary

    Returns:
        GraphModule: The same GraphModule with profiling information
    """
    for profiler_cls in (
            FlopProfiler,
            # CommunicationProfiler,    # TODO: add communication profiling
    ):
        profiler = profiler_cls(module)
        profiler.propagate(*args, device=_current_device(module))

        if verbose:
            print(profiler.summary())
    return module
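
# A minimal end-to-end sketch of this pass, kept as doctest-style comments so
# that importing this module stays side-effect free. ``MyModule`` is a
# hypothetical model; the tracing and shape-propagation entry points follow
# the usage shown in ``FlopProfiler``'s docstring (exact import paths may
# differ across versions):
#
# >>> model = MyModule()
# >>> x = torch.rand(10, 10)
# >>> gm = colossalai.fx.symbolic_trace(model, meta_args={'x': x})
# >>> ShapeProp(gm).propagate(x)                      # populate MetaInfo first
# >>> gm = graph_profile_pass(gm, x, verbose=True)    # prints the summary table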