ColossalAI/colossalai/_analyzer/fx/node_util.py

from dataclasses import dataclass, field
from typing import Callable, ClassVar, Dict, List, Optional, Tuple, Union

import torch
from torch.autograd.profiler_util import _format_memory, _format_time
from torch.fx import Graph, GraphModule, Node

from colossalai._analyzer.envs import MeshConfig


def intersect(a, b):
    """Return the entries of ``a`` whose keys are also present in ``b``.

    Values are always taken from ``a``; ``b`` is only used for membership tests.
    The iteration order of ``a`` is preserved.
    """
    common = {}
    for key in a:
        if key in b:
            common[key] = a[key]
    return common


def subtract(a, b):
    """Return the entries of ``a`` whose keys are absent from ``b``.

    ``b`` is only used for membership tests; the iteration order of ``a`` is
    preserved.
    """
    remaining = {}
    for key in a:
        if key in b:
            continue
        remaining[key] = a[key]
    return remaining


def union(a, b):
    """Return a new dict holding the entries of both ``a`` and ``b``.

    When a key appears in both mappings the value from ``b`` wins. Neither
    input is modified.
    """
    merged = dict(a)
    merged.update(b)
    return merged


def compute_size_in_bytes(elem: Union[torch.Tensor, Dict, List, Tuple, int]) -> int:
    """Compute the size of a tensor or a collection of tensors in bytes.

    Args:
        elem (Union[torch.Tensor, Dict, List, Tuple, int]): Arbitrary nested
            ``torch.Tensor`` data structure. Dict values and the members of
            tuples, lists and sets are visited recursively; any other object
            (e.g. an ``int``) contributes 0 bytes.

    Returns:
        int: The size of the tensor or the collection of tensors in bytes.
    """
    # NOTE: the annotation uses ``typing.Union`` rather than ``X | Y`` because the
    # latter is evaluated at definition time and fails on Python < 3.10.
    if isinstance(elem, torch.Tensor):
        # ``element_size`` reports the per-element byte width of the tensor's own
        # dtype (quantized or not), so no throwaway tensor has to be allocated.
        return elem.numel() * elem.element_size()
    if isinstance(elem, dict):
        return sum(compute_size_in_bytes(value) for value in elem.values())
    if isinstance(elem, (tuple, list, set)):
        return sum(compute_size_in_bytes(item) for item in elem)
    return 0


@dataclass
class MetaInfo:
    r"""
    The base class to store all profiling and static graph analysis information
    needed for auto-parallel system in Colossal-AI.
    ============================================================================
                            -------------------------------
                            |          FX.Node            |    <-----
    [input/param] are  ---> |[input/param]      [grad_inp]|    [grad_inp] contributes to the
    placeholders (might be  |     | \__________     |     |    profiled peak memory in backward
    saved for backward.     |     |            \    |     |    pass. [grad_param] is calculated
                            |     |             \   |     |    separately.
                            | [interm] -------> [grad_int]|    <-----
                            |     |  \_________     |     |    [grad_interm] marks the peak
                            |    / \           \    |     |    memory in backward pass.
    [x] is not counted ---> | [x]  [interm] --> [grad_int]|    <-----
    in [interm] because     |          |  \_____    |     |
    it is not saved for     |          |        \   |     |
    backward.               |      [output]      \  |     |    <----- [output] is potentially
                            -------------------------------    [input] for the next node.
    ============================================================================

    Accumulate Size = ALL_PREVIOUS_CTX U {Interm Size + Output Size}
    Output Size = ([output] in global_ctx and not is_alias)
    Temp Size = ([output] not in global_ctx and not is_alias)
    Backward Size = ([grad_inp])

    Usage:
        >>> for node in graph.nodes:
        >>>     n_info = MetaInfo(node)     # will create a new MetaInfo instance and store in node.meta['info']
        >>>                                 # if not exist, otherwise return the existing one
        >>>     n_info.to_recompute = ...   # set the to_recompute attribute

    Remarks:
        This feature is experimental and all the entries are subject to change.
    """

    # reference
    node: Node

    # directory
    mod_dir: str = ''

    # ctx[data_ptr] = Tensor
    # mark the storage for ctx.save_for_backward
    # (the size properties below match these keys against ``Tensor.data_ptr()`` values)
    global_ctx: Dict[str, torch.Tensor] = field(default_factory=dict)    # globally shared
    curr_ctx: Dict[str, torch.Tensor] = field(default_factory=dict)    # global_ctx till this node

    # should be updated after each graph manipulation
    # ============================== Update ====================================
    # parameter and buffer within ``Node``
    parameters: Dict[str, torch.nn.Parameter] = field(default_factory=dict)
    buffers: Dict[str, torch.Tensor] = field(default_factory=dict)

    inputs: Tuple[torch.Tensor] = ()
    outputs: Tuple[torch.Tensor] = ()
    is_alias: Tuple[bool] = ()    # whether the output is an alias of input

    # compute cost
    fwd_flop: Optional[int] = 0
    bwd_flop: Optional[int] = 0

    # communication cost (should be the size in bytes of communication)
    fwd_comm: Optional[int] = 0
    bwd_comm: Optional[int] = 0

    # should keep the same whenever manipulated
    # ============================= Invariant ==================================
    to_recompute: Tuple[torch.Tensor] = ()    # (region_0, region_1, ...) support nested codegen
    to_offload: Optional[bool] = False
    sharding_spec: str = 'RR'

    def __new__(cls, node: Node, **kwargs):
        """Return the ``MetaInfo`` already stored on ``node``, or allocate a new one.

        If ``node.meta['info']`` is set, that object is returned and the keyword
        arguments of this call are not applied to it. Otherwise a fresh instance
        is allocated and initialised by the dataclass ``__init__`` as usual.
        """
        orig_init = cls.__init__

        # if initialized, return the existing one
        # should disable the __init__ function
        if node.meta.get('info', None) is not None:

            # Python calls ``cls.__init__`` on whatever ``__new__`` returns when it
            # is an instance of ``cls``; that would overwrite every field of the
            # existing object. ``__init__`` is therefore swapped for ``_dummy`` for
            # exactly one call, after which the real ``__init__`` is restored.
            def _dummy(self, *args, **kwargs):
                # NOTE(review): nothing visible in this file sets ``_is_init``, so
                # this branch looks unreachable and ``_dummy`` only restores
                # ``__init__`` -- confirm whether the guard is intentional.
                if getattr(self, '_is_init', False):
                    self._is_init = True
                    orig_init(self, *args, **kwargs)
                cls.__init__ = orig_init

            cls.__init__ = _dummy
            return node.meta['info']
        return super().__new__(cls)

    def __post_init__(self):
        """Register this instance on its node so later ``MetaInfo(node)`` calls find it."""
        self.node.meta['info'] = self

    @property
    def fwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):
        """Forward time estimate: ``fwd_flop / tflops + fwd_comm / bandwidth``.

        Accessed as a property, so the ``MeshConfig`` defaults captured at class
        definition time are always the values used.
        """
        return self.fwd_flop / tflops + self.fwd_comm / bandwidth

    @property
    def bwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):
        """Backward time estimate: ``bwd_flop / tflops + bwd_comm / bandwidth``.

        Accessed as a property, so the ``MeshConfig`` defaults captured at class
        definition time are always the values used.
        """
        return self.bwd_flop / tflops + self.bwd_comm / bandwidth

    @property
    def param_size(self):
        """Total size in bytes of ``parameters``."""
        return compute_size_in_bytes(self.parameters)

    @property
    def buffer_size(self):
        """Total size in bytes of ``buffers``."""
        return compute_size_in_bytes(self.buffers)

    def _output_ctx(self):
        """Map ``data_ptr()`` -> tensor for outputs that are non-alias, non-parameter tensors.

        ``outputs`` and ``is_alias`` are zipped, so outputs without a matching
        ``is_alias`` entry are not considered.
        """
        return {
            out.data_ptr(): out
            for out, aliased in zip(self.outputs, self.is_alias)
            if not aliased and isinstance(out, torch.Tensor) and not isinstance(out, torch.nn.Parameter)
        }

    @property
    def output_size(self):
        """Used in CheckpointSolver

        Size in bytes of the ``global_ctx`` entries whose keys match an output of this node.
        """
        return compute_size_in_bytes(intersect(self.global_ctx, self._output_ctx()))

    @property
    def accumulate_size(self):
        """Used in CheckpointSolver

        Size in bytes of ``curr_ctx`` merged with the ``global_ctx`` entries that
        match an output of this node.
        """
        return compute_size_in_bytes(union(self.curr_ctx, intersect(self.global_ctx, self._output_ctx())))

    @property
    def temp_size(self):
        """Used in CheckpointSolver

        Size in bytes of the outputs of this node whose keys are not in ``global_ctx``.
        """
        return compute_size_in_bytes(subtract(self._output_ctx(), self.global_ctx))

    @property
    def backward_size(self):
        """Used in CheckpointSolver

        Size in bytes of ``inputs``.
        """
        return compute_size_in_bytes(self.inputs)

    def __repr__(self):
        """Multi-line summary: node name, non-zero sizes, costs and invariant flags."""
        lines = [f'Node {self.node.name}']
        if self.parameters:
            lines.append(f'has parameter of size {_format_memory(self.param_size)}')
        if self.buffers:
            lines.append(f'has buffer of size {_format_memory(self.buffer_size)}')
        # The original read ``self.total_size``, which this class never defines and
        # therefore raised AttributeError; ``accumulate_size`` is reported instead.
        activation_sizes = (
            ('output', self.output_size),
            ('accumulated', self.accumulate_size),
            ('temp', self.temp_size),
            ('backward', self.backward_size),
        )
        for label, size in activation_sizes:
            if size:
                lines.append(f'has {label} activation of size {_format_memory(size)}')
        lines.extend([
            f'fwd_flop = {self.fwd_flop}',
            f'bwd_flop = {self.bwd_flop}',
            f'fwd_comm = {self.fwd_comm}',
            f'bwd_comm = {self.bwd_comm}',
            f'to_recompute = {self.to_recompute}',
            f'to_offload = {self.to_offload}',
            f'sharding_spec = {self.sharding_spec}',
        ])
        return '\n\t'.join(lines)
[analyzer] a minimal implementation of static graph analyzer (#2852) * [hotfix] meta tensor default device. * [siu] add experimental submodules to main branch. * [siu] * [siu] * [analyzer] init. * [analyzer] readme. * [analyzer] readme. * [analyzer] readme. * [analyzer] readme. * [test] add test. * Update symbolic_trace.py * mark skip tests. * try except. * try except. * try except. * s * init * init * fix * skip * skip --------- Co-authored-by: Daniel Shao <superdainiu@MININT-PVARVID.fareast.corp.microsoft.com> Co-authored-by: Daniel Shao <superdainiu@Daniels-Mac.local> 2 years ago			`from dataclasses import dataclass, field`
			`from typing import Callable, ClassVar, Dict, List, Optional, Tuple, Union`

			`import torch`
			`from torch.autograd.profiler_util import _format_memory, _format_time`
			`from torch.fx import Graph, GraphModule, Node`

			`from colossalai._analyzer.envs import MeshConfig`


			`def intersect(a, b):`
			`return {k: a[k] for k in a if k in b}`


			`def subtract(a, b):`
			`return {k: a[k] for k in a if k not in b}`


			`def union(a, b):`
			`return {**a, **b}`


			`def compute_size_in_bytes(elem: torch.Tensor \| Dict \| List \| Tuple \| int) -> int:`
			`"""Compute the size of a tensor or a collection of tensors in bytes.`

			`Args:`
			elem (torch.Tensor \| Dict \| List \| Tuple \| int): Arbitrary nested ``torch.Tensor`` data structure.

			`Returns:`
			`int: The size of the tensor or the collection of tensors in bytes.`
			`"""`
			`nbytes = 0`
			`if isinstance(elem, torch.Tensor):`
			`if elem.is_quantized:`
			`nbytes += elem.numel() * torch._empty_affine_quantized([], dtype=elem.dtype).element_size()`
			`else:`
			`nbytes += elem.numel() * torch.tensor([], dtype=elem.dtype).element_size()`
			`elif isinstance(elem, dict):`
			`value_list = [v for _, v in elem.items()]`
			`nbytes += compute_size_in_bytes(value_list)`
			`elif isinstance(elem, tuple) or isinstance(elem, list) or isinstance(elem, set):`
			`for e in elem:`
			`nbytes += compute_size_in_bytes(e)`
			`return nbytes`


			`@dataclass`
			`class MetaInfo:`
			`r"""`
			`The base class to store all profiling and static graph analysis information`
			`needed for auto-parallel system in Colossal-AI.`
			`============================================================================`
			`-------------------------------`
			`\| FX.Node \| <-----`
			`[input/param] are ---> \|[input/param] [grad_inp]\| [grad_inp] contributes to the`
			`placeholders (might be \| \| \__________ \| \| profiled peak memory in backward`
			`saved for backward. \| \| \ \| \| pass. [grad_param] is calculated`
			`\| \| \ \| \| separately.`
			`\| [interm] -------> [grad_int]\| <-----`
			`\| \| \_________ \| \| [grad_interm] marks the peak`
			`\| / \ \ \| \| memory in backward pass.`
			`[x] is not counted ---> \| [x] [interm] --> [grad_int]\| <-----`
			`in [interm] because \| \| \_____ \| \|`
			`it is not saved for \| \| \ \| \|`
			`backward. \| [output] \ \| \| <----- [output] is potentially`
			`------------------------------- [input] for the next node.`
			`============================================================================`

			`Accumulate Size = ALL_PREVIOUS_CTX U {Interm Size + Output Size}`
			`Output Size = ([output] in global_ctx and not is_alias)`
			`Temp Size = ([output] not in global_ctx and not is_alias)`
			`Backward Size = ([grad_inp])`

			`Usage:`
			`>>> for node in graph.nodes:`
			`>>> n_info = MetaInfo(node) # will create a new MetaInfo instance and store in node.meta['info']`
			`>>> # if not exist, otherwise return the existing one`
			`>>> n_info.to_recompute = ... # set the to_recompute attribute`

			`Remarks:`
			`This feature is experimental and all the entries are subject to change.`
			`"""`

			`# reference`
			`node: Node`

			`# directory`
			`mod_dir: str = ''`

			`# ctx[data_ptr] = Tensor`
			`# mark the storage for ctx.save_for_backward`
			`global_ctx: Dict[str, torch.Tensor] = field(default_factory=lambda: {}) # globally shared`
			`curr_ctx: Dict[str, torch.Tensor] = field(default_factory=lambda: {}) # global_ctx till this node`

			`# should be updated after each graph manipulation`
			`# ============================== Update ====================================`
			# parameter and buffer within ``Node``
			`parameters: Dict[str, torch.nn.Parameter] = field(default_factory=lambda: {})`
			`buffers: Dict[str, torch.Tensor] = field(default_factory=lambda: {})`

			`inputs: Tuple[torch.Tensor] = ()`
			`outputs: Tuple[torch.Tensor] = ()`
			`is_alias: Tuple[bool] = () # whether the output is an alias of input`

			`# compute cost`
			`fwd_flop: Optional[int] = 0`
			`bwd_flop: Optional[int] = 0`

			`# communication cost (should be the size in bytes of communication)`
			`fwd_comm: Optional[int] = 0`
			`bwd_comm: Optional[int] = 0`

			`# should keep the same whenever manipulated`
			`# ============================= Invariant ==================================`
			`to_recompute: Tuple[torch.Tensor] = () # (region_0, region_1, ...) support nested codegen`
			`to_offload: Optional[bool] = False`
			`sharding_spec: str = 'RR'`

			`def __new__(cls, node: Node, **kwargs):`
			`orig_init = cls.__init__`

			`# if initialized, return the existing one`
			`# should disable the __init__ function`
			`if node.meta.get('info', None) is not None:`

			`def _dummy(self, *args, **kwargs):`
			`if getattr(self, '_is_init', False):`
			`self._is_init = True`
			`orig_init(self, *args, **kwargs)`
			`cls.__init__ = orig_init`

			`cls.__init__ = _dummy`
			`return node.meta['info']`
			`return super().__new__(cls)`

			`def __post_init__(self):`
			`self.node.meta['info'] = self`

			`@property`
			`def fwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):`
			`return self.fwd_flop / tflops + self.fwd_comm / bandwidth`

			`@property`
			`def bwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH):`
			`return self.bwd_flop / tflops + self.bwd_comm / bandwidth`

			`@property`
			`def param_size(self):`
			`return compute_size_in_bytes(self.parameters)`

			`@property`
			`def buffer_size(self):`
			`return compute_size_in_bytes(self.buffers)`

			`@property`
			`def output_size(self):`
			`"""Used in CheckpointSolver"""`
			`output_ctx = {`
			`o.data_ptr(): o`
			`for o, is_alias in zip(self.outputs, self.is_alias)`
			`if not is_alias and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)`
			`}`
			`return compute_size_in_bytes(intersect(self.global_ctx, output_ctx))`

			`@property`
			`def accumulate_size(self):`
			`"""Used in CheckpointSolver"""`
			`output_ctx = {`
			`o.data_ptr(): o`
			`for o, is_alias in zip(self.outputs, self.is_alias)`
			`if not is_alias and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)`
			`}`
			`return compute_size_in_bytes(union(self.curr_ctx, intersect(self.global_ctx, output_ctx)))`

			`@property`
			`def temp_size(self):`
			`"""Used in CheckpointSolver"""`
			`output_ctx = {`
			`o.data_ptr(): o`
			`for o, is_alias in zip(self.outputs, self.is_alias)`
			`if not is_alias and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)`
			`}`
			`return compute_size_in_bytes(subtract(output_ctx, self.global_ctx))`

			`@property`
			`def backward_size(self):`
			`"""Used in CheckpointSolver"""`
			`return compute_size_in_bytes(self.inputs)`

			`def __repr__(self):`
			`s = f'Node {self.node.name}'`
			`if self.parameters:`
			`s += f'\n\thas parameter of size {_format_memory(self.param_size)}'`
			`if self.buffers:`
			`s += f'\n\thas buffer of size {_format_memory(self.buffer_size)}'`
			`if self.output_size:`
			`s += f'\n\thas output activation of size {_format_memory(self.output_size)}'`
			`if self.total_size:`
			`s += f'\n\thas total activation of size {_format_memory(self.total_size)}'`
			`if self.temp_size:`
			`s += f'\n\thas temp activation of size {_format_memory(self.temp_size)}'`
			`if self.backward_size:`
			`s += f'\n\thas backward activation of size {_format_memory(self.backward_size)}'`
			`s += f'\n\tfwd_flop = {self.fwd_flop}'\`
			`f'\n\tbwd_flop = {self.bwd_flop}'\`
			`f'\n\tfwd_comm = {self.fwd_comm}'\`
			`f'\n\tbwd_comm = {self.bwd_comm}'\`
			`f'\n\tto_recompute = {self.to_recompute}'\`
			`f'\n\tto_offload = {self.to_offload}'\`
			`f'\n\tsharding_spec = {self.sharding_spec}'`
			`return s`