# ColossalAI/colossalai/auto_parallel/passes/meta_info_prop.py

import uuid
from dataclasses import asdict
from typing import List

import torch
import torch.fx
from torch.fx import GraphModule
from torch.fx.node import Node

from colossalai.auto_parallel.meta_profiler import ShardMetaInfo
from colossalai.auto_parallel.passes.constants import OUTPUT_SAVED_MOD, OUTPUT_SAVED_OPS
from colossalai.fx._compatibility import compatibility
from colossalai.fx.profiler import GraphInfo


def _normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


@compatibility(is_backward_compatible=False)
class MetaInfoProp:
    """
    Pass that attaches profiling information to every node of a traced module.

    For each node of ``module.graph`` a fresh ``GraphInfo`` is filled in and stored,
    converted to a plain dict with ``dataclasses.asdict``, in ``node.meta``.
    ``call_function`` / ``call_module`` / ``call_method`` nodes take their values from
    ``node.best_strategy_info``; the other node kinds are handled by dedicated handlers.

    Args:
        module (GraphModule): the traced module whose graph nodes are annotated in place.
    """

    def __init__(self, module: GraphModule) -> None:
        self.module = module
        # dispatch table: node.op -> handler that fills in node.meta
        self.func_dict = {
            'placeholder': self.placeholder_handler,
            'get_attr': self.get_attr_handler,
            'output': self.output_handler,
            'call_function': self.node_handler,
            'call_module': self.node_handler,
            'call_method': self.node_handler,
        }

    def _set_data_ptr(self, x):
        """
        Give ``x`` a unique identifier if it is a tensor whose ``data_ptr()`` is falsy.

        The identifier is a ``uuid.uuid4()`` value returned by a replacement ``data_ptr``
        callable set on the tensor instance. Non-tensor values and tensors that already
        report a truthy ``data_ptr()`` are left untouched.
        """
        if isinstance(x, torch.Tensor) and not x.data_ptr():
            data_ptr = uuid.uuid4()
            x.data_ptr = lambda: data_ptr

    def _is_inplace(self, node: Node):
        """
        Check if the node is inplace operation.

        A ``call_module`` node qualifies when the class of its target submodule is in
        ``OUTPUT_SAVED_MOD``; a ``call_function`` node qualifies when its target is in
        ``OUTPUT_SAVED_OPS``. Every other node kind returns ``False``.
        """
        if node.op == 'call_module':
            return node.graph.owning_module.get_submodule(node.target).__class__ in OUTPUT_SAVED_MOD
        elif node.op == "call_function":
            return node.target in OUTPUT_SAVED_OPS
        return False

    def _first_parent_output(self, node: Node):
        """
        Return the first ``fwd_out`` entry recorded on the first input node of ``node``.

        Returns ``None`` when the node has no input node, or when that input node has
        no ``fwd_out`` entry or an empty one.
        """
        parent_node = next(iter(node._input_nodes), None)
        if parent_node is None:
            return None
        parent_outputs = parent_node.meta.get("fwd_out")
        return parent_outputs[0] if parent_outputs else None

    def run(self) -> GraphModule:
        """
        Run the meta information propagation pass on the module.

        Returns:
            GraphModule: ``self.module``, after every node has been handled.
        """
        for node in self.module.graph.nodes:
            node: Node
            self.func_dict[node.op](node)
        return self.module

    @compatibility(is_backward_compatible=False)
    def placeholder_handler(self, node: Node) -> None:
        """
        Handle the placeholder node.

        ``fwd_out`` is taken from ``node._meta_data`` (normalized to a tuple). It is left
        empty when the attribute is missing, is an empty tuple, or its first entry is ``None``.
        """
        graph_info = GraphInfo()
        out = _normalize_tuple(getattr(node, '_meta_data', None))
        # `out` may be an empty tuple, so check it before looking at its first entry
        graph_info.fwd_out = list(out) if out and out[0] is not None else []
        node.meta = {**asdict(graph_info)}

    @compatibility(is_backward_compatible=False)
    def get_attr_handler(self, node: Node) -> None:
        """
        Handle the get_attr node.

        Only a default-constructed ``GraphInfo`` is attached.
        """
        graph_info = GraphInfo()
        node.meta = {**asdict(graph_info)}

    @compatibility(is_backward_compatible=False)
    def output_handler(self, node: Node) -> None:
        """
        Handle the output node.

        ``fwd_in`` is the concatenation of the ``fwd_out`` entries of all input nodes;
        input nodes without a ``fwd_out`` entry contribute nothing.
        """
        graph_info = GraphInfo()
        output_tensors = []
        for par in node._input_nodes:
            # `.get` mirrors node_handler and tolerates an empty or partial meta dict
            output_tensors.extend(par.meta.get("fwd_out", []))
        graph_info.fwd_in = output_tensors
        node.meta = {**asdict(graph_info)}

    @compatibility(is_backward_compatible=False)
    def node_handler(self, node: Node) -> None:
        """
        Handle other kind of nodes

        Reads ``node.best_strategy_info`` (a ``ShardMetaInfo``), assigns ``data_ptr``
        identifiers to its forward input / buffer / output tensors, and copies the
        tensors together with the memory and compute costs into ``node.meta``.

        Raises:
            AssertionError: if ``node`` has no ``best_strategy_info`` attribute.
        """
        assert hasattr(node, 'best_strategy_info'), f"Cannot find best_strategy_info in node {node}, {node.op}"
        graph_info = GraphInfo()
        meta_info: ShardMetaInfo = node.best_strategy_info

        # tensors recorded in the ShardMetaInfo class
        input_tensors: List[torch.Tensor] = meta_info.fwd_in
        buffer_tensors: List[torch.Tensor] = meta_info.fwd_buffer
        output_tensors: List[torch.Tensor] = meta_info.fwd_out

        # inplace operation will not create new tensor, and it only has one parent node
        # TODO: Verify this observation
        # When the parent has no recorded output (e.g. a get_attr parent) there is
        # nothing to alias, so the node falls through to the regular path below.
        parent_tensor = self._first_parent_output(node) if self._is_inplace(node) else None

        if parent_tensor is not None:
            # input, buffer and output tensors all share the data_ptr of the parent output
            for tensor_group in (input_tensors, buffer_tensors, output_tensors):
                for tensor in tensor_group:
                    tensor.data_ptr = parent_tensor.data_ptr
        else:
            for par in node._input_nodes:
                # set data_ptr for the input_tensor of current node from the output_tensor of its parent node
                for tensor in par.meta.get("fwd_out", []):
                    tensor: torch.Tensor
                    # first input tensor that has no data_ptr yet and matches the parent output shape
                    target_input_tensor = next(
                        (x for x in input_tensors if not x.data_ptr() and x.shape == tensor.shape), None)
                    if target_input_tensor is not None:
                        target_input_tensor.data_ptr = tensor.data_ptr

            # remaining input tensors, buffer tensors and output tensors get fresh identifiers;
            # _set_data_ptr itself skips tensors that already have one
            for tensor_group in (input_tensors, buffer_tensors, output_tensors):
                for tensor in tensor_group:
                    self._set_data_ptr(tensor)

        # attach them to graph_info
        graph_info.fwd_in = input_tensors
        graph_info.fwd_tmp = buffer_tensors
        graph_info.fwd_out = output_tensors

        # fetch other memory information
        memory_cost = meta_info.memory_cost
        graph_info.fwd_mem_tmp = memory_cost.fwd.temp
        graph_info.fwd_mem_out = memory_cost.fwd.activation
        graph_info.bwd_mem_tmp = memory_cost.bwd.temp
        graph_info.bwd_mem_out = memory_cost.bwd.activation

        # fetch flop information
        # here we use fwd_time and bwd_time to deal with the case that
        # communication cost is a float
        compute_cost = meta_info.compute_cost
        graph_info.fwd_time = compute_cost.fwd
        graph_info.bwd_time = compute_cost.bwd

        node.meta = {**asdict(graph_info)}
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`import uuid`
			`from dataclasses import asdict`
[autoparallel] Hook all meta information on ResNet nodes for auto activation checkpoint (#2248) * [autoparallel] hook node meta on graph nodes for checkpoint solver * [autoparallel] polish code * [autoparallel] restore some node handlers * colossalai/auto_parallel/passes/meta_info_prop.py * [autoparallel] remove some unused import * [autoparallel] hook bwd_mem_out 2023-01-02 08:25:18 +00:00			`from typing import List`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
			`import torch`
			`import torch.fx`
			`from torch.fx import GraphModule`
[autoparallel] Hook all meta information on ResNet nodes for auto activation checkpoint (#2248) * [autoparallel] hook node meta on graph nodes for checkpoint solver * [autoparallel] polish code * [autoparallel] restore some node handlers * colossalai/auto_parallel/passes/meta_info_prop.py * [autoparallel] remove some unused import * [autoparallel] hook bwd_mem_out 2023-01-02 08:25:18 +00:00			`from torch.fx.node import Node`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
[autoparallel]integrate auto parallel feature with new tracer (#3408) * [autoparallel] integrate new analyzer in module level * unify the profiling method * polish * fix no codegen bug * fix pass bug * fix liveness test * polish 2023-04-04 09:40:45 +00:00			`from colossalai.auto_parallel.meta_profiler import ShardMetaInfo`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`from colossalai.auto_parallel.passes.constants import OUTPUT_SAVED_MOD, OUTPUT_SAVED_OPS`
[autoparallel] Hook all meta information on ResNet nodes for auto activation checkpoint (#2248) * [autoparallel] hook node meta on graph nodes for checkpoint solver * [autoparallel] polish code * [autoparallel] restore some node handlers * colossalai/auto_parallel/passes/meta_info_prop.py * [autoparallel] remove some unused import * [autoparallel] hook bwd_mem_out 2023-01-02 08:25:18 +00:00			`from colossalai.fx._compatibility import compatibility`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`from colossalai.fx.profiler import GraphInfo`


			`def _normalize_tuple(x):`
			`if not isinstance(x, tuple):`
			`return (x,)`
			`return x`


			`@compatibility(is_backward_compatible=False)`
			`class MetaInfoProp:`

			`def __init__(self, module: GraphModule) -> None:`
			`self.module = module`
			`self.func_dict = {`
			`'placeholder': self.placeholder_handler,`
			`'get_attr': self.get_attr_handler,`
			`'output': self.output_handler,`
			`'call_function': self.node_handler,`
			`'call_module': self.node_handler,`
			`'call_method': self.node_handler,`
			`}`

			`def _set_data_ptr(self, x):`
			`"""`
			`Set uuid to tensor`
			`"""`
			`if isinstance(x, torch.Tensor):`
			`if not x.data_ptr():`
			`data_ptr = uuid.uuid4()`
			`x.data_ptr = lambda: data_ptr`

			`def _is_inplace(self, node: Node):`
			`"""`
			`Check if the node is inplace operation.`
			`"""`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`if node.op == 'call_module':`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`return node.graph.owning_module.get_submodule(node.target).__class__ in OUTPUT_SAVED_MOD`
			`elif node.op == "call_function":`
			`return node.target in OUTPUT_SAVED_OPS`
			`return False`

			`def run(self) -> GraphModule:`
			`"""`
			`Run the meta information propagation pass on the module.`
			`"""`
			`for node in self.module.graph.nodes:`
			`node: Node`
			`self.func_dict[node.op](node)`

			`@compatibility(is_backward_compatible=False)`
			`def placeholder_handler(self, node: Node) -> None:`
			`"""`
			`Handle the placeholder node.`
			`"""`
			`graph_info = GraphInfo()`
			`out = _normalize_tuple(getattr(node, '_meta_data', None))`
[autoparallel] Hook all meta information on ResNet nodes for auto activation checkpoint (#2248) * [autoparallel] hook node meta on graph nodes for checkpoint solver * [autoparallel] polish code * [autoparallel] restore some node handlers * colossalai/auto_parallel/passes/meta_info_prop.py * [autoparallel] remove some unused import * [autoparallel] hook bwd_mem_out 2023-01-02 08:25:18 +00:00			`graph_info.fwd_out = list(out) if out[0] is not None else []`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`node.meta = {**asdict(graph_info)}`

			`@compatibility(is_backward_compatible=False)`
			`def get_attr_handler(self, node: Node) -> None:`
			`"""`
			`Handle the get_attr node.`
			`"""`
			`graph_info = GraphInfo()`
			`node.meta = {**asdict(graph_info)}`

			`@compatibility(is_backward_compatible=False)`
			`def output_handler(self, node: Node) -> None:`
			`"""`
			`Handle the output node.`
			`"""`
			`graph_info = GraphInfo()`
			`output_tensors = []`
			`for par in node._input_nodes:`
			`if par.meta:`
			`output_tensors += par.meta["fwd_out"]`
			`graph_info.fwd_in = output_tensors`
			`node.meta = {**asdict(graph_info)}`

			`@compatibility(is_backward_compatible=False)`
			`def node_handler(self, node: Node) -> None:`
			`"""`
			`Handle other kind of nodes`
			`"""`
[autoparallel]integrate auto parallel feature with new tracer (#3408) * [autoparallel] integrate new analyzer in module level * unify the profiling method * polish * fix no codegen bug * fix pass bug * fix liveness test * polish 2023-04-04 09:40:45 +00:00			`assert hasattr(node, 'best_strategy_info'), f"Cannot find best_strategy_info in node {node}, {node.op}"`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`graph_info = GraphInfo()`
[autoparallel]integrate auto parallel feature with new tracer (#3408) * [autoparallel] integrate new analyzer in module level * unify the profiling method * polish * fix no codegen bug * fix pass bug * fix liveness test * polish 2023-04-04 09:40:45 +00:00			`meta_info = node.best_strategy_info`
			`meta_info: ShardMetaInfo`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
[autoparallel]integrate auto parallel feature with new tracer (#3408) * [autoparallel] integrate new analyzer in module level * unify the profiling method * polish * fix no codegen bug * fix pass bug * fix liveness test * polish 2023-04-04 09:40:45 +00:00			`# set data_ptr for input_tensor in ShardMetaInfo class`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`input_tensors: List[torch.Tensor] = meta_info.fwd_in`
			`buffer_tensors: List[torch.Tensor] = meta_info.fwd_buffer`
			`output_tensors: List[torch.Tensor] = meta_info.fwd_out`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`if self._is_inplace(node):`
			`# inplace operation will not create new tensor, and it only has one parent node`
			`# TODO: Verify this observation`
			`# set data_ptr for input_tensor, buffer_tensor and output_tensor of current node`
			`parent_node = list(node._input_nodes.keys())[0]`
			`parent_tensor = parent_node.meta.get("fwd_out")[0]`
			`parent_tensor: torch.Tensor`
			`for tensor in input_tensors:`
			`tensor.data_ptr = parent_tensor.data_ptr`
			`for tensor in buffer_tensors:`
			`tensor.data_ptr = parent_tensor.data_ptr`
			`for tensor in output_tensors:`
			`tensor.data_ptr = parent_tensor.data_ptr`

			`else:`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`for par in node._input_nodes:`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`# set data_ptr for the input_tensor of current node from the output_tensor of its parent node`
			`for tensor in par.meta.get("fwd_out", []):`
			`tensor: torch.Tensor`
			`target_input_tensor = next(`
			`(x for x in input_tensors if not x.data_ptr() and x.shape == tensor.shape), None)`
			`if target_input_tensor is not None:`
			`target_input_tensor.data_ptr = tensor.data_ptr`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
			`# set data_ptr for tensor in input_tensor that is not set`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`for tensor in input_tensors:`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`if not tensor.data_ptr():`
			`self._set_data_ptr(tensor)`

			`# set data_ptr for buffer_tensor`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`for tensor in buffer_tensors:`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`self._set_data_ptr(tensor)`

			`# set data_ptr for output_tensor`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00			`for tensor in output_tensors:`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`self._set_data_ptr(tensor)`
[autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline (#2261) 2023-01-03 02:30:15 +00:00
			`# attach them to graph_info`
			`graph_info.fwd_in = input_tensors`
			`graph_info.fwd_tmp = buffer_tensors`
			`graph_info.fwd_out = output_tensors`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
fix typo colossalai/auto_parallel autochunk fx/passes etc. (#3808) 2023-05-24 01:01:50 +00:00			`# fetch other memory information`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`memory_cost = meta_info.memory_cost`
			`graph_info.fwd_mem_tmp = memory_cost.fwd.temp`
[autoparallel] modify comm nodes' memory cost in construct chain (#2263) * [autoparallel] align the data_ptr with the old version of auto activation checkpoint pipeline * [autoparallel] using fwd_time and bwd_time instead of fwd_flop and bwd_flop * [autoparallel] specifycomm nodes' memory cost in construct chain 2023-01-03 03:38:48 +00:00			`graph_info.fwd_mem_out = memory_cost.fwd.activation`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00			`graph_info.bwd_mem_tmp = memory_cost.bwd.temp`
[autoparallel] Hook all meta information on ResNet nodes for auto activation checkpoint (#2248) * [autoparallel] hook node meta on graph nodes for checkpoint solver * [autoparallel] polish code * [autoparallel] restore some node handlers * colossalai/auto_parallel/passes/meta_info_prop.py * [autoparallel] remove some unused import * [autoparallel] hook bwd_mem_out 2023-01-02 08:25:18 +00:00			`graph_info.bwd_mem_out = memory_cost.bwd.activation`

			`# fetch flop information`
			`# here we use fwd_time and bwd_time to deal with the case that`
			`# communication cost is a float`
			`compute_cost = meta_info.compute_cost`
			`graph_info.fwd_time = compute_cost.fwd`
			`graph_info.bwd_time = compute_cost.bwd`
[autoparallel] new metainfoprop based on metainfo class (#2179) * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] new metainfoprop to combine SPMD solver and checkpoint solver * [autoparallel] modify placeholder handler * [autoparallel] modify metainfoprop * [autoparallel] fix function typo * [autoparallel] fix placeholder handler 2022-12-28 05:35:08 +00:00
			`node.meta = {**asdict(graph_info)}`