ColossalAI/colossalai/_analyzer/fx/node_util.py

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch.autograd.profiler_util import _format_memory
from torch.fx import Node

from colossalai._analyzer.envs import MeshConfig


def intersect(a, b):
    """Return the entries of ``a`` whose keys also appear in ``b``.

    Values are always taken from ``a``; ``b`` is only consulted for key
    membership. The result is a new dict that keeps the iteration order of ``a``.
    """
    shared = {}
    for key in a:
        if key in b:
            shared[key] = a[key]
    return shared


def subtract(a, b):
    """Return the entries of ``a`` whose keys do not appear in ``b``.

    ``b`` is only consulted for key membership. The result is a new dict that
    keeps the iteration order of ``a``.
    """
    remaining = {}
    for key in a:
        if key in b:
            continue
        remaining[key] = a[key]
    return remaining


def union(a, b):
    """Return a new dict holding the entries of both ``a`` and ``b``.

    When a key is present in both mappings, the value from ``b`` wins.
    Neither input is modified.
    """
    merged = dict(a)
    merged.update(b)
    return merged


def compute_size_in_bytes(elem: Union[torch.Tensor, Dict, List, Tuple, int]) -> int:
    """Sum the byte sizes of every tensor found in a (possibly nested) structure.

    Args:
        elem (torch.Tensor | Dict | List | Tuple | int): A tensor, or a nesting of
            dicts / lists / tuples / sets whose leaves may be tensors. For dicts
            only the values are inspected.

    Returns:
        int: ``numel * element_size`` accumulated over all tensors found. Leaves
        that are neither tensors nor one of the containers above contribute 0.
    """
    if isinstance(elem, torch.Tensor):
        # The per-element width is read from an empty tensor of the same dtype
        # rather than from ``elem`` itself.
        if elem.is_quantized:
            probe = torch._empty_affine_quantized([], dtype=elem.dtype)
        else:
            probe = torch.tensor([], dtype=elem.dtype)
        return elem.numel() * probe.element_size()

    if isinstance(elem, dict):
        # Keys are ignored; only the stored values can contribute.
        return compute_size_in_bytes(list(elem.values()))

    if isinstance(elem, (tuple, list, set)):
        return sum(compute_size_in_bytes(item) for item in elem)

    return 0


@dataclass
class MetaInfo:
    r"""
    The base class to store all profiling and static graph analysis information
    needed for auto-parallel system in Colossal-AI.
    ============================================================================
                            -------------------------------
                            |          FX.Node            |    <-----
    [input/param] are  ---> |[input/param]      [grad_inp]|    [grad_inp] contributes to the
    placeholders (might be  |     | \__________     |     |    profiled peak memory in backward
    saved for backward).    |     |            \    |     |    pass. [grad_param] is calculated
                            |     |             \   |     |    separately.
                            | [interm] -------> [grad_int]|    <-----
                            |     |  \_________     |     |    [grad_interm] marks the peak
                            |    / \           \    |     |    memory in backward pass.
    [x] is not counted ---> | [x]  [interm] --> [grad_int]|    <-----
    in [interm] because     |          |  \_____    |     |
    it is not saved for     |          |        \   |     |
    backward.               |      [output]      \  |     |    <----- [output] is potentially
                            -------------------------------    [input] for the next node.
    ============================================================================

    Accumulate Size = ALL_PREVIOUS_CTX U {Interm Size + Output Size}
    Output Size = ([output] in global_ctx and not is_alias)
    Temp Size = ([output] not in global_ctx and not is_alias)
    Backward Size = ([grad_inp])

    Usage:
        >>> for node in graph.nodes:
        >>>     n_info = MetaInfo(node)     # will create a new MetaInfo instance and store in node.meta['info']
        >>>                                 # if not exist, otherwise return the existing one
        >>>     n_info.to_recompute = ...   # set the to_recompute attribute

    Remarks:
        This feature is experimental and all the entries are subject to change.
        ``to_recompute`` is not a declared field; it only exists on an instance
        once a caller assigns it (``__repr__`` prints ``None`` until then).
    """

    # reference
    node: Node

    # directory
    mod_dir: str = ""

    # ctx[data_ptr] = Tensor
    # mark the storage for ctx.save_for_backward
    # (the size properties below look these dicts up by ``Tensor.data_ptr()``)
    global_ctx: Dict[int, torch.Tensor] = field(default_factory=dict)  # globally shared
    curr_ctx: Dict[int, torch.Tensor] = field(default_factory=dict)  # global_ctx till this node

    # should be updated after each graph manipulation
    # ============================== Update ====================================
    # parameter and buffer within ``Node``
    parameters: Dict[str, torch.nn.Parameter] = field(default_factory=dict)
    buffers: Dict[str, torch.Tensor] = field(default_factory=dict)

    inputs: Tuple[torch.Tensor, ...] = ()
    outputs: Tuple[torch.Tensor, ...] = ()
    is_alias: Tuple[bool, ...] = ()  # whether the output is an alias of input

    # compute cost
    fwd_flop: Optional[int] = 0
    bwd_flop: Optional[int] = 0

    # communication cost (should be the size in bytes of communication)
    fwd_comm: Optional[int] = 0
    bwd_comm: Optional[int] = 0

    # should keep the same whenever manipulated
    # ============================= Invariant ==================================
    activation_checkpoint: Tuple[torch.Tensor] = ()  # (region_0, region_1, ...) support nested codegen
    to_offload: Optional[bool] = False
    sharding_spec: str = "RR"

    def __new__(cls, node: Node, *args, **kwargs):
        """Return the ``MetaInfo`` already stored on ``node`` if there is one.

        When ``node.meta['info']`` is set, that object is returned and
        ``cls.__init__`` is temporarily swapped for a stub so that the
        dataclass-generated initializer does not overwrite the existing
        fields. Otherwise a fresh, uninitialized instance is created and the
        regular ``__init__`` / ``__post_init__`` run as usual.

        Extra positional and keyword arguments are accepted only so that they
        can flow on to ``__init__``; ``__new__`` itself ignores them.
        """
        orig_init = cls.__init__

        # if initialized, return the existing one
        # should disable the __init__ function
        if node.meta.get("info", None) is not None:

            def _dummy(self, *args, **kwargs):
                # ``_is_init`` is never assigned by this class, so for instances
                # it creates this branch is skipped and the original
                # initializer is not re-run on the existing object.
                if getattr(self, "_is_init", False):
                    self._is_init = True
                    orig_init(self, *args, **kwargs)
                # One-shot: put the real initializer back for the next construction.
                cls.__init__ = orig_init

            cls.__init__ = _dummy
            return node.meta["info"]
        return super().__new__(cls)

    def __post_init__(self):
        # Register this instance on the node so later ``MetaInfo(node)`` calls find it.
        self.node.meta["info"] = self

    @property
    def fwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH) -> float:
        """Forward cost: ``fwd_flop / tflops + fwd_comm / bandwidth``.

        As a property this is always evaluated with the ``MeshConfig`` defaults.
        """
        return self.fwd_flop / tflops + self.fwd_comm / bandwidth

    @property
    def bwd_time(self, tflops: float = MeshConfig.TFLOPS, bandwidth: float = MeshConfig.BANDWIDTH) -> float:
        """Backward cost: ``bwd_flop / tflops + bwd_comm / bandwidth``.

        As a property this is always evaluated with the ``MeshConfig`` defaults.
        """
        return self.bwd_flop / tflops + self.bwd_comm / bandwidth

    @property
    def param_size(self) -> int:
        """Total size in bytes of ``self.parameters``."""
        return compute_size_in_bytes(self.parameters)

    @property
    def buffer_size(self) -> int:
        """Total size in bytes of ``self.buffers``."""
        return compute_size_in_bytes(self.buffers)

    def _output_ctx(self) -> Dict[int, torch.Tensor]:
        """Map ``data_ptr`` -> tensor for outputs that are non-alias, non-parameter tensors.

        ``outputs`` and ``is_alias`` are paired with ``zip``, so outputs beyond
        the length of ``is_alias`` are not considered.
        """
        return {
            o.data_ptr(): o
            for o, is_alias in zip(self.outputs, self.is_alias)
            if not is_alias and isinstance(o, torch.Tensor) and not isinstance(o, torch.nn.Parameter)
        }

    @property
    def output_size(self) -> int:
        """Used in CheckpointSolver.

        Size in bytes of the ``global_ctx`` entries whose key matches an
        output of this node.
        """
        return compute_size_in_bytes(intersect(self.global_ctx, self._output_ctx()))

    @property
    def accumulate_size(self) -> int:
        """Used in CheckpointSolver.

        Size in bytes of ``curr_ctx`` merged with the ``global_ctx`` entries
        whose key matches an output of this node.
        """
        return compute_size_in_bytes(union(self.curr_ctx, intersect(self.global_ctx, self._output_ctx())))

    @property
    def temp_size(self) -> int:
        """Used in CheckpointSolver.

        Size in bytes of this node's outputs whose key is absent from ``global_ctx``.
        """
        return compute_size_in_bytes(subtract(self._output_ctx(), self.global_ctx))

    @property
    def backward_size(self) -> int:
        """Used in CheckpointSolver.

        Size in bytes of ``self.inputs``.
        """
        return compute_size_in_bytes(self.inputs)

    def __repr__(self):
        """Multi-line summary of the node: non-zero sizes followed by cost fields."""
        parts = [f"Node {self.node.name}"]
        if self.parameters:
            parts.append(f"\n\thas parameter of size {_format_memory(self.param_size)}")
        if self.buffers:
            parts.append(f"\n\thas buffer of size {_format_memory(self.buffer_size)}")

        # Evaluate each size once; every property access recomputes it.
        output_size = self.output_size
        if output_size:
            parts.append(f"\n\thas output activation of size {_format_memory(output_size)}")
        temp_size = self.temp_size
        if temp_size:
            parts.append(f"\n\thas temp activation of size {_format_memory(temp_size)}")
        backward_size = self.backward_size
        if backward_size:
            parts.append(f"\n\thas backward activation of size {_format_memory(backward_size)}")

        # ``to_recompute`` is not a dataclass field; it is only present when a
        # caller has assigned it, so read it defensively to keep repr() usable.
        to_recompute = getattr(self, "to_recompute", None)
        parts.append(
            f"\n\tfwd_flop = {self.fwd_flop}"
            f"\n\tbwd_flop = {self.bwd_flop}"
            f"\n\tfwd_comm = {self.fwd_comm}"
            f"\n\tbwd_comm = {self.bwd_comm}"
            f"\n\tto_recompute = {to_recompute}"
            f"\n\tto_offload = {self.to_offload}"
            f"\n\tsharding_spec = {self.sharding_spec}"
        )
        return "".join(parts)