import math import sys from typing import List, Tuple from torch.fx import Node from colossalai.fx.codegen.activation_checkpoint_codegen import _find_nested_ckpt_regions from colossalai.fx.graph_module import ColoGraphModule from colossalai.fx.profiler import activation_size, calculate_fwd_out, calculate_fwd_tmp, parameter_size from colossalai.logging import get_dist_logger from .linearize import linearize from .operation import Backward, Chain, ForwardCheck, ForwardEnable, ForwardNograd, Function, Loss, Sequence # global vairable to indicate whether the solver is failed SOLVER_FAILED = False # this is the python compute table code from rotor # https://gitlab.inria.fr/hiepacs/rotor # paper link: https://hal.inria.fr/hal-02352969 def _compute_table(chain: Chain, mmax) -> Tuple: """Returns the optimal table: a tuple containing: Opt[m][lmin][lmax] with lmin = 0...chain.length and lmax = lmin...chain.length (lmax is not included) and m = 0...mmax what[m][lmin][lmax] is (True,) if the optimal choice is a chain checkpoint (False, j) if the optimal choice is a leaf checkpoint of length j The computation uses dynamic programming""" fw = chain.fweight + [0] ## forward time bw = chain.bweight ## backward time, not used cw = chain.cweight + [0] ## size of x (and of y) cbw = chain.cbweight + [0] ## size of xbar fwd_mem_tmp = chain.fwd_mem_tmp + [0] bwd_mem_tmp = chain.bwd_mem_tmp + [0] # Build table opt = [[{} for _ in range(chain.length + 1)] for _ in range(mmax + 1)] what = [[{} for _ in range(chain.length + 1)] for _ in range(mmax + 1)] # Last one is a dict because its indices go from i to l. Renumbering will wait for C implementation # Initialize borders of the tables for lmax-lmin = 0 for m in range(mmax + 1): for i in range(chain.length + 1): #lmax-lmin = 0 limit = max(cw[i + 1] + cbw[i + 1] + fwd_mem_tmp[i], cw[i + 1] + cbw[i + 1] + bwd_mem_tmp[i]) if m >= limit: ## Equation (1) opt[m][i][i] = fw[i] + bw[i] else: opt[m][i][i] = float("inf") # Compute everything for m in range(mmax + 1): for d in range(1, chain.length + 1): for i in range(chain.length + 1 - d): # for idx in range(i+1, chain.length + 1): idx = i + d mmin = cw[idx + 1] + cw[i + 1] + fwd_mem_tmp[i] if idx > i + 1: mmin = max(mmin, cw[idx + 1] + max(cw[j] + cw[j + 1] + fwd_mem_tmp[j] for j in range(i + 1, idx))) if m < mmin: opt[m][i][idx] = float("inf") else: leaf_checkpoints = [(j, sum(fw[i:j]) + opt[m - cw[j]][j][idx] + opt[m][i][j - 1]) for j in range(i + 1, idx + 1) if m >= cw[j]] if leaf_checkpoints: best_leaf = min(leaf_checkpoints, key=lambda t: t[1]) else: best_leaf = None if m >= cbw[i + 1]: chain_checkpoint = opt[m][i][i] + opt[m - cbw[i + 1]][i + 1][idx] else: chain_checkpoint = float("inf") if best_leaf and best_leaf[1] <= chain_checkpoint: opt[m][i][idx] = best_leaf[1] what[m][i][idx] = (False, best_leaf[0]) else: opt[m][i][idx] = chain_checkpoint what[m][i][idx] = (True,) return (opt, what) def _rec(chain: Chain, lmin, lmax, cmem, opt_table): """ chain : the class describing the AC graph lmin : index of the first forward to execute lmax : upper bound index of the last forward to execute (not included) cmem : number of available memory slots Return the optimal sequence of makespan Opt_hete[cmem][lmin][lmax-lmin]""" if cmem <= 0: raise ValueError("Can not process a chain with negative memory {cmem}".format(cmem=cmem)) opt, what = opt_table sequence = Sequence(Function("Persistent", lmax - lmin, cmem)) if opt[cmem][lmin][lmax] == float("inf"): # using logger to annonce that the solver is failed logger = get_dist_logger() logger.info("Can not process this chain from index {lmin} to {lmax} with memory {cmem}".format(lmin=lmin, lmax=lmax, cmem=cmem)) # set global indicater SOLVER_FAILED to True global SOLVER_FAILED SOLVER_FAILED = True return sequence if lmin == lmax: if lmin == chain.length: sequence.insert(Loss()) else: sequence.insert(ForwardEnable(lmin)) sequence.insert(Backward(lmin)) return sequence if what[cmem][lmin][lmax][0]: sequence.insert(ForwardEnable(lmin)) sequence.insert_sequence(_rec(chain, lmin + 1, lmax, cmem - chain.cbweight[lmin + 1], opt_table)) sequence.insert(Backward(lmin)) else: j = what[cmem][lmin][lmax][1] sequence.insert(ForwardCheck(lmin)) for k in range(lmin + 1, j): sequence.insert(ForwardNograd(k)) sequence.insert_sequence(_rec(chain, j, lmax, cmem - chain.cweight[j], opt_table)) sequence.insert_sequence(_rec(chain, lmin, j - 1, cmem, opt_table)) return sequence def _fwd_xbar(node: List[Node]) -> int: """Get the forward xbar of a node Args: node (List[Node]): List of torch.fx Node, indicates a node in linearized graph Returns: int: xbar size, unit Byte """ xbar = 0 for n in node: xbar += calculate_fwd_tmp(n) + calculate_fwd_out(n) return xbar def _fwd_time(node: List[Node]) -> int: """Get the foward time of a node Args: node (List[Node]): List of torch.fx Node, indicates a node in linearized graph Returns: int: foward time, extimated by flops count """ fwd_time = 0 for n in node: # minimum flop count is needed fwd_time += max(n.meta['fwd_flop'], 1) return fwd_time def _bwd_time(node: List[Node]) -> int: """Get the backward time of a node Args: node (List[Node]): List of torch.fx Node, indicates a node in linearized graph Returns: int: backward time, extimated by flops count """ bwd_time = 0 for n in node: # minimum flop count is needed bwd_time += max(n.meta['bwd_flop'], 1) return bwd_time def _get_fwd_mem_tmp(node: List[Node]) -> int: """Get the forward temp memory of a node This could be done by subtracting the saved activation from all output of a node Args: node (List[Node]): List of torch.fx Node, indicates a node in linearized graph Returns: int: forward temp memory, unit Byte """ n = node[-1] return activation_size(n.meta['fwd_out']) - calculate_fwd_out(n) def _get_bwd_mem_tmp(node: List[Node]) -> int: """Get the backward temp memory of a node Args: node (List[Node]): List of torch.fx Node, indicates a node in linearized graph Returns: int: backward temp memory, unit Byte """ def _get_deps_size(): deps_size = 0 for k, v in deps.items(): k: Node if v > 0: deps_size += k.meta['bwd_mem_out'] if v == float('-inf'): deps_size -= calculate_fwd_tmp(k) + calculate_fwd_out(k) return deps_size bwd_mem_tmp = 0 deps = {} for n in reversed(node): deps[n] = len(n.all_input_nodes) bwd_mem_tmp = max(bwd_mem_tmp, _get_deps_size() + n.meta['bwd_mem_tmp']) for child in n.users: if child in deps: deps[child] -= 1 if deps[child] <= 0: deps[child] = float('-inf') # free return bwd_mem_tmp def _construct_chain(node_list: List[List[Node]], input) -> Chain: fwd_time = [] bwd_time = [] xbar_sizes = [activation_size(input)] x_sizes = [activation_size(input)] tmp_fwd = [] tmp_bwd = [] for idx, node in enumerate(node_list): fwd_time.append(_fwd_time(node)) bwd_time.append(_bwd_time(node)) x_sizes.append(calculate_fwd_out(node[-1])) xbar_sizes.append(max(x_sizes[-1], _fwd_xbar(node))) tmp_fwd.append(_get_fwd_mem_tmp(node)) tmp_bwd.append(_get_bwd_mem_tmp(node)) bwd_time.append(0) # currently we view loss backward temp as zero tmp_bwd.append(0) return Chain(fwd_time, bwd_time, x_sizes, xbar_sizes, tmp_fwd, tmp_bwd) def _annotate_from_sequence(sequence: Sequence, node_list: List[List[Node]]): op_list = sequence.list_operations() loss_op = next(op for op in op_list if isinstance(op, Loss)) fwd_list = op_list[:op_list.index(loss_op)] bwd_list = op_list[op_list.index(loss_op) + 1:] ckpt_idx = 0 in_ckpt = False ckpt_region = [] # forward annotation for idx, op in enumerate(fwd_list, 0): if in_ckpt: if isinstance(op, ForwardNograd): ckpt_region.append(idx) elif isinstance(op, ForwardEnable): in_ckpt = False for node_idx in ckpt_region: for n in node_list[node_idx]: setattr(n, "activation_checkpoint", [ckpt_idx]) ckpt_idx += 1 ckpt_region = [] elif isinstance(op, ForwardCheck): for node_idx in ckpt_region: for n in node_list[node_idx]: setattr(n, "activation_checkpoint", [ckpt_idx]) ckpt_idx += 1 ckpt_region = [idx] else: if isinstance(op, ForwardCheck): in_ckpt = True ckpt_region.append(idx) # annotate the backward if there is any nested activation checkpoint in_recompute = False for op in bwd_list: if in_recompute: if isinstance(op, ForwardNograd): ckpt_region.append(op.index) elif isinstance(op, ForwardEnable): for node_idx in ckpt_region: for n in node_list[node_idx]: n.activation_checkpoint.append(ckpt_idx) ckpt_idx += 1 ckpt_region = [] elif isinstance(op, ForwardCheck): for node_idx in ckpt_region: for n in node_list[node_idx]: n.activation_checkpoint.append(ckpt_idx) ckpt_idx += 1 ckpt_region = [op.index] elif isinstance(op, Backward): for node_idx in ckpt_region: for n in node_list[node_idx]: n.activation_checkpoint.append(ckpt_idx) in_recompute = False else: if not isinstance(op, Backward): in_recompute = True ckpt_idx = 0 ckpt_region = [] if isinstance(op, ForwardCheck): ckpt_region.append(op.index) # postprocess, make sure every activation checkpoint label in the # same activation checkpoint region (level = 0) has the same length op_list = [] for node in node_list: op_list += node ckpt_regions = _find_nested_ckpt_regions(op_list) for (start_idx, end_idx) in ckpt_regions: nested_length = max(len(op_list[idx].activation_checkpoint) for idx in range(start_idx, end_idx + 1)) for idx in range(start_idx, end_idx + 1): op_list[idx].activation_checkpoint += [None] * (nested_length - len(op_list[idx].activation_checkpoint)) def solver_rotor(gm: ColoGraphModule, data, mem_limit: int, mem_slots: int = 500, cnode: List[str] = None, eps: float = 0.0, force_python: bool = False) -> ColoGraphModule: """solver that automatically find activation checkpoint in rotor's manner Args: gm (ColoGraphModule): ColoGraphModule generated by tracing model and MetaInfoProp. data (torch.Tensor): input data. mem_limit (int): memory budget in Byte. mem_slots (int, optional): number of slots for discretizing memory budget. Defaults to 500. cnode (List[Node], optional): common node list for linearize. Defaults to None. eps (float): epsilon for memory decay. Defaults to 0.0 force_python (bool): force to use python version of dynamic programs Returns: ColoGraphModule: annotated ColoGraphModuled with __sequence__ attribute """ # try to import C version solver if force_python is not set logger = get_dist_logger() if not force_python: try: from .dynamic_programs_C_version import persistent_compute_table CVERSION = True # build module if module not found except ModuleNotFoundError: import os import subprocess logger.info("dynamic_programs_C_version hasn't been built! Building library...", ranks=[0]) this_dir = os.path.dirname(os.path.abspath(__file__)) result = subprocess.Popen( [ f"{sys.executable}", f"{os.path.join(this_dir, 'build_c_ext.py')}", "build_ext", f"--build-lib={this_dir}" ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) if result.wait() == 0: logger.info("dynamic_programs_C_version has been built!", ranks=[0]) from .dynamic_programs_C_version import persistent_compute_table CVERSION = True else: logger.info("dynamic_programs_C_version built failed! Using python version!", ranks=[0]) CVERSION = False else: CVERSION = False # check if metainfoprop is done if any(len(node.meta) == 0 for node in gm.graph.nodes): raise RuntimeError( "Nodes meta information hasn't been prepared! Please run MetaInfoProp before calling solver!") # linearize the graph node_list = linearize(gm, cnode) # construct chain mem_unit = mem_limit * (1.0 - eps) // mem_slots chain: Chain = _construct_chain(node_list, data) chain._discretize(mem_unit) # use C version if possible if CVERSION and not force_python: logger.info("Using C version rotor solver!", ranks=[0]) opt_table = persistent_compute_table(chain, mem_slots) else: opt_table = _compute_table(chain, mem_slots) logger.info("Using python version rotor solver!", ranks=[0]) # found sequence sequence = _rec(chain, 0, chain.length, mem_slots - chain.cweight[0], opt_table) # if solver failed, we don't need to annotate the graph if not SOLVER_FAILED: _annotate_from_sequence(sequence, node_list) # set __sequence__ attribute to GraphModule if SOLVER_FAILED: setattr(gm, "__sequence__", None) else: setattr(gm, "__sequence__", sequence) # set __opttable__ attribute to GraphModule setattr(gm, "__opttable__", opt_table[0]) gm.recompile() return gm