ColossalAI/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py

from copy import deepcopy
from typing import Any, Dict, List, Tuple

from torch import Tensor
from torch.fx import Graph, Node

from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply, runtime_comm_spec_apply
from colossalai.fx.codegen.activation_checkpoint_codegen import _find_nested_ckpt_regions
from colossalai.fx.profiler import (
    activation_size,
    calculate_bwd_time,
    calculate_fwd_out,
    calculate_fwd_time,
    calculate_fwd_tmp,
)
from colossalai.logging import get_dist_logger

from .ckpt_solver_base import CheckpointSolverBase
from .operation import Backward, Chain, ForwardCheck, ForwardEnable, ForwardNograd, Loss, Sequence

__all__ = ['CheckpointSolverRotor']


class CheckpointSolverRotor(CheckpointSolverBase):

    def __init__(self,
                 graph: Graph,
                 free_memory: float = -1,
                 cnode: List[str] = None,
                 memory_slots: int = 500,
                 optim_multiplier: float = 1.0):
        """This is the simple implementation of dynamic programming algorithm rotor
        in https://hal.inria.fr/hal-02352969. Some code are adapted from
        https://gitlab.inria.fr/hiepacs/rotor.

        Usage:
            Assume that we have a ``GraphModule``, and we have already done the extractions
            to the graph to retrieve all information needed, then we could use the following
            code to find a solution using ``CheckpointSolverRotor``:
            >>> solver = CheckpointSolverRotor(gm.graph, free_memory=torch.cuda.mem_get_info(device=0)[0])
            >>> rotor_graph = solver.solve(force_python=True)   # otherwise use C solver
            >>> gm.graph = rotor_graph    # set the graph to a new graph

        Args:
            graph (Graph): The computing graph to be optimized.
            free_memory (float, optional): Memory constraint for the solution, unit is byte.
                Use ``torch.cuda.mem_get_info(device=0)[0]`` to estimate the free_memory. Defaults to -1.
            cnode (List[str], optional): Common node List, should be the subset of input. Defaults to None.
            memory_slots (int, optional): Number of slots for discretizing memory budget. Defaults to 500.
            optim_multiplier (float, optional): The multiplier of extra weight storage for the
            ``torch.optim.Optimizer``. Default to 1.0.
        """
        super().__init__(graph, free_memory, True, cnode, optim_multiplier)
        self.memory_slots = memory_slots

        # construct chain
        unit = self.free_memory // self.memory_slots
        self.chain = self._construct_chain(self.graph, self.node_list)
        self.chain.discretize_all(unit)

        self.cost_table = None
        self.back_ptr = None
        self.sequence = None

    def solve(self, force_python: bool = False, verbose: bool = False) -> Graph:
        """Solve the checkpointing problem using rotor algorithm.

        Args:
            force_python (bool, optional): Use Python version of solver, else use C version. Defaults to False.
            verbose (bool, optional): Print verbose information. Defaults to False.

        Returns:
            graph (Graph): The optimized graph, should be a copy of the original graph.
        """
        chain = self.chain

        # compute cost table
        if force_python:
            self.cost_table, self.back_ptr = self._compute_table(chain, self.memory_slots)
        else:
            self.cost_table, self.back_ptr = self._compute_table_c(chain, self.memory_slots)

        if verbose:
            self.print_chain()

        # backtrack
        try:
            self.sequence = self._backtrack(chain, 0, len(chain), self.memory_slots - chain.x[0], self.cost_table,
                                            self.back_ptr)
            self._annotate_from_sequence(self.sequence, self.node_list)
        except ValueError as e:
            # using logger to annonce that the solver is failed
            logger = get_dist_logger()
            logger.warning(f'Checkpoint solver failed: {e}')
            raise ValueError

        if verbose:
            self.print_sequence()

        return deepcopy(self.graph)

    def print_chain(self):
        print('[input]', self.chain.x[0], self.chain.xbar[0], self.chain.ftmp[0], self.chain.btmp[0])
        for idx in range(len(self.node_list) - 1):
            print(self.node_list[idx], self.chain.x[idx + 1], self.chain.xbar[idx + 1], self.chain.ftmp[idx],
                  self.chain.btmp[idx])
        print(f'Chain = {self.chain}')

    def print_sequence(self):
        print(f'Sequence = {self.sequence}')

    @classmethod
    def _construct_chain(cls, graph: Graph, node_list: List[List[Node]]) -> Chain:
        input_tensors = cls._extract_input(graph)
        ftime, btime, ftmp, btmp = list(), list(), list(), list()
        xbar, x = [activation_size(input_tensors)], [activation_size(input_tensors)]

        for node in node_list:
            node_info = cls._extract_node_info(node)
            ftime.append(node_info[0])
            btime.append(node_info[1])
            x.append(node_info[2])
            xbar.append(node_info[3])
            ftmp.append(node_info[4])
            btmp.append(node_info[5])

        # currently we view loss backward temp as zero
        btime.append(0)
        btmp.append(0)

        return Chain(ftime, btime, x, xbar, ftmp, btmp)

    @classmethod
    def _extract_node_info(cls, node: List[Node]) -> Tuple[int, ...]:
        """Extract node info from a list of nodes"""
        xbar = 0
        ftime = 0
        btime = 0
        fwd_mem_peak = 0
        for n in node:
            assert isinstance(n, Node), f'{n} is not a Node'
            if n.target == runtime_apply or n.target == runtime_comm_spec_apply:
                # in this case we need to calculate memory usage directly based on the statics that hooked in node.meta
                xbar += n.meta['fwd_mem_out']
                fwd_mem_peak = max(fwd_mem_peak, xbar + n.meta['fwd_mem_tmp'])
            else:
                xbar += calculate_fwd_tmp(n) + calculate_fwd_out(n)
                fwd_mem_peak = max(fwd_mem_peak, xbar + n.meta['fwd_mem_tmp'] + cls._extract_unused_output(n))

            # minimum flop count is required
            ftime += max(calculate_fwd_time(n), 1.0)
            btime += max(calculate_bwd_time(n), 1.0)

        x = calculate_fwd_out(node[-1])
        xbar = max(x, xbar)
        ftmp = fwd_mem_peak - xbar
        btmp = cls._extract_btmp(node)
        return ftime, btime, x, xbar, ftmp, btmp

    @staticmethod
    def _extract_input(graph: Graph) -> Tuple[Tensor, ...]:
        """Extract input tensors from a Graph"""
        input_tensors = []
        for node in graph.nodes:
            if node.op == 'placeholder':
                input_tensors.append(node.meta['fwd_out'])
        return input_tensors

    @staticmethod
    def _extract_unused_output(node: Node) -> int:
        """Extract unused output from `torch.fx.Node`"""
        return activation_size(node.meta['fwd_out']) - calculate_fwd_out(node)

    @staticmethod
    def _extract_btmp(node: List[Node]) -> int:
        """Extract btmp from a list of nodes"""

        def _extract_deps_size():
            deps_size = 0
            for k, v in deps.items():
                k: Node
                if v > 0:
                    deps_size += k.meta['bwd_mem_out']
                if v == float('-inf'):
                    deps_size -= calculate_fwd_tmp(k) + calculate_fwd_out(k)

            return deps_size

        btmp = 0
        deps = {}
        for n in reversed(node):
            deps[n] = len(n.all_input_nodes)
            btmp = max(btmp, _extract_deps_size() + n.meta['bwd_mem_tmp'])
            for child in n.users:
                if child in deps:
                    deps[child] -= 1
                    if deps[child] <= 0:
                        deps[child] = float('-inf')    # free
        return btmp

    @staticmethod
    def _compute_table(chain: Chain, mmax: int) -> Tuple:
        """Compute the table using dynamic programming. Returns the cost table and the backtracking pointer.

        Args:
            chain (Chain): A basic linearized structure for solving the dynamic programming problem.
            mmax (int): Maximum number of memory slots.

        Returns:
            cost_table (List): cost_table[m][lhs][rhs] indicates the optimal cost of the subproblem from lhs to rhs
            with m memory slots.
            back_ptr (List): back_ptr[m][lhs][rhs] indicates the best operation at this point. It is (True,) if the optimal choice
            is a chain checkpoint, it is (False, j) if the optimal choice is a leaf checkpoint of length j
        """

        ftime = chain.ftime + [0.0]
        btime = chain.btime
        x = chain.x + [0]
        xbar = chain.xbar + [0]
        ftmp = chain.ftmp + [0]
        btmp = chain.btmp + [0]

        # Build table
        cost_table = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]
        back_ptr = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]

        # Initialize corner cases where length of sequence equals to 1, i.e. lhs == rhs
        for m in range(mmax + 1):
            for i in range(len(chain) + 1):
                limit = max(x[i + 1] + xbar[i + 1] + ftmp[i], x[i + 1] + xbar[i + 1] + btmp[i])
                if m >= limit:
                    cost_table[m][i][i] = ftime[i] + btime[i]
                else:
                    cost_table[m][i][i] = float("inf")

        # Compute tables
        for m in range(mmax + 1):
            for d in range(1, len(chain) + 1):
                for i in range(len(chain) + 1 - d):
                    idx = i + d
                    mmin = x[idx + 1] + x[i + 1] + ftmp[i]
                    if idx > i + 1:
                        mmin = max(mmin, x[idx + 1] + max(x[j] + x[j + 1] + ftmp[j] for j in range(i + 1, idx)))
                    if m < mmin:
                        cost_table[m][i][idx] = float("inf")
                    else:
                        leaf_checkpoints = [(j,
                                             sum(ftime[i:j]) + cost_table[m - x[j]][j][idx] + cost_table[m][i][j - 1])
                                            for j in range(i + 1, idx + 1)
                                            if m >= x[j]]
                        if leaf_checkpoints:
                            best_leaf = min(leaf_checkpoints, key=lambda t: t[1])
                        else:
                            best_leaf = None
                        if m >= xbar[i + 1]:
                            chain_checkpoint = cost_table[m][i][i] + cost_table[m - xbar[i + 1]][i + 1][idx]
                        else:
                            chain_checkpoint = float("inf")
                        if best_leaf and best_leaf[1] <= chain_checkpoint:
                            cost_table[m][i][idx] = best_leaf[1]
                            back_ptr[m][i][idx] = (False, best_leaf[0])
                        else:
                            cost_table[m][i][idx] = chain_checkpoint
                            back_ptr[m][i][idx] = (True,)
        return cost_table, back_ptr

    @staticmethod
    def _compute_table_c(chain: Chain, mmax: int) -> Tuple:
        try:
            from .rotorc import compute_table

        # build module if module not found
        except ModuleNotFoundError:
            import os
            import subprocess
            import sys
            logger = get_dist_logger()
            logger.info("rotorc hasn't been built! Building library...", ranks=[0])
            this_dir = os.path.dirname(os.path.abspath(__file__))
            result = subprocess.Popen(
                [
                    f"{sys.executable}", f"{os.path.join(this_dir, 'build_c_ext.py')}", "build_ext",
                    f"--build-lib={this_dir}"
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if result.wait() == 0:
                logger.info("rotorc has been built!", ranks=[0])
                from .rotorc import compute_table
            else:
                logger.warning("rotorc built failed! Using python version!", ranks=[0])
                return CheckpointSolverRotor._compute_table(chain, mmax)
        return compute_table(chain, mmax)

    @staticmethod
    def _backtrack(chain: Chain, lhs: int, rhs: int, budget: int, cost_table: List[Any],
                   back_ptr: List[Any]) -> "Sequence":
        """Backtrack the cost table and retrieve the optimal checkpointing strategy.

        Args:
            chain (Chain): A basic linearized structure for solving the dynamic programming problem.
            lhs (int): The left index of the interval to backtrack.
            rhs (int): The right index of the interval to backtrack.
            budget (int): The memory budget for processing this interval.
            cost_table (List[Any]): See ``._compute_table()`` for definitions
            back_ptr (List[Any]): See ``._compute_table()`` for definitions

        Raises:
            ValueError: Can not process the chain.

        Returns:
            sequence (Sequence): The sequence of executing nodes with checkpoints.
        """
        if budget <= 0:
            raise ValueError(f"Can not process a chain with negative memory {budget}")
        elif cost_table[budget][lhs][rhs] == float("inf"):
            raise ValueError(f"Can not process this chain from index {lhs} to {rhs} with memory {budget}")

        sequence = Sequence()
        if rhs == lhs:
            if lhs == len(chain):
                sequence += [Loss()]
            else:
                sequence += [ForwardEnable(lhs), Backward(lhs)]
            return sequence

        if back_ptr[budget][lhs][rhs][0]:
            sequence += [
                ForwardEnable(lhs),
                CheckpointSolverRotor._backtrack(chain, lhs + 1, rhs, budget - chain.xbar[lhs + 1], cost_table,
                                                 back_ptr),
                Backward(lhs),
            ]
        else:
            best_leaf = back_ptr[budget][lhs][rhs][1]
            sequence += [ForwardCheck(lhs)]
            sequence += [ForwardNograd(k) for k in range(lhs + 1, best_leaf)]
            sequence += [
                CheckpointSolverRotor._backtrack(chain, best_leaf, rhs, budget - chain.x[best_leaf], cost_table,
                                                 back_ptr),
                CheckpointSolverRotor._backtrack(chain, lhs, best_leaf - 1, budget, cost_table, back_ptr),
            ]
        return sequence

    @staticmethod
    def _annotate_from_sequence(sequence: Sequence, node_list: List[List[Node]]):
        """Annotate the nodes in the ``node_list`` with activation checkpoint from the sequence.

        Args:
            sequence (Sequence): The sequence of executing nodes with activation checkpoint annotations.
            node_list (List[List[Node]]): The list of nodes to annotate.
        """
        op_list = sequence.list_operations()
        loss_op = next(op for op in op_list if isinstance(op, Loss))
        fwd_list = op_list[:op_list.index(loss_op)]
        bwd_list = op_list[op_list.index(loss_op) + 1:]
        ckpt_idx = 0
        in_ckpt = False
        ckpt_region = []

        # forward annotation
        for idx, op in enumerate(fwd_list, 0):
            if in_ckpt:
                if isinstance(op, ForwardNograd):
                    ckpt_region.append(idx)

                elif isinstance(op, ForwardEnable):
                    in_ckpt = False
                    for node_idx in ckpt_region:
                        for n in node_list[node_idx]:
                            n.meta['activation_checkpoint'] = [ckpt_idx]

                    ckpt_idx += 1
                    ckpt_region = []

                elif isinstance(op, ForwardCheck):
                    for node_idx in ckpt_region:
                        for n in node_list[node_idx]:
                            n.meta['activation_checkpoint'] = [ckpt_idx]

                    ckpt_idx += 1
                    ckpt_region = [idx]

            else:
                if isinstance(op, ForwardCheck):
                    in_ckpt = True
                    ckpt_region.append(idx)

        # annotate the backward if there is any nested activation checkpoint
        in_recompute = False
        for op in bwd_list:
            if in_recompute:
                if isinstance(op, ForwardNograd):
                    ckpt_region.append(op.index)

                elif isinstance(op, ForwardEnable):
                    for node_idx in ckpt_region:
                        for n in node_list[node_idx]:
                            n.meta['activation_checkpoint'].append(ckpt_idx)

                    ckpt_idx += 1
                    ckpt_region = []

                elif isinstance(op, ForwardCheck):
                    for node_idx in ckpt_region:
                        for n in node_list[node_idx]:
                            n.meta['activation_checkpoint'].append(ckpt_idx)

                    ckpt_idx += 1
                    ckpt_region = [op.index]

                elif isinstance(op, Backward):
                    for node_idx in ckpt_region:
                        for n in node_list[node_idx]:
                            n.meta['activation_checkpoint'].append(ckpt_idx)

                    in_recompute = False

            else:
                if not isinstance(op, Backward):
                    in_recompute = True
                    ckpt_idx = 0
                    ckpt_region = []
                    if isinstance(op, ForwardCheck):
                        ckpt_region.append(op.index)

        # postprocess, make sure every activation checkpoint label in the
        # same activation checkpoint region (level = 0) has the same length
        op_list = []
        for node in node_list:
            op_list += node
        ckpt_regions = _find_nested_ckpt_regions(op_list)
        for (start_idx, end_idx) in ckpt_regions:
            nested_length = max(
                len(op_list[idx].meta['activation_checkpoint']) for idx in range(start_idx, end_idx + 1))
            for idx in range(start_idx, end_idx + 1):
                op_list[idx].meta['activation_checkpoint'] += [None] * (nested_length -
                                                                        len(op_list[idx].meta['activation_checkpoint']))