|
|
|
import math
|
|
|
|
from copy import deepcopy
|
|
|
|
from typing import List, Set, Tuple
|
|
|
|
|
|
|
|
from torch.fx import Graph, Node
|
|
|
|
|
|
|
|
from colossalai.fx.profiler import calculate_fwd_in, calculate_fwd_tmp
|
|
|
|
|
|
|
|
from .ckpt_solver_base import CheckpointSolverBase
|
|
|
|
|
|
|
|
__all__ = ['CheckpointSolverChen']
|
|
|
|
|
|
|
|
|
|
|
|
class CheckpointSolverChen(CheckpointSolverBase):
|
|
|
|
|
|
|
|
def __init__(self, graph: Graph, cnode: List[str] = None, num_grids: int = 6):
|
|
|
|
"""
|
|
|
|
This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
|
|
|
|
Note that this algorithm targets at memory optimization only, using techniques in appendix A.
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
Assume that we have a ``GraphModule``, and we have already done the extractions
|
|
|
|
to the graph to retrieve all information needed, then we could use the following
|
|
|
|
code to find a solution using ``CheckpointSolverChen``:
|
|
|
|
>>> solver = CheckpointSolverChen(gm.graph)
|
|
|
|
>>> chen_graph = solver.solve()
|
|
|
|
>>> gm.graph = chen_graph # set the graph to a new graph
|
|
|
|
|
|
|
|
Args:
|
|
|
|
graph (Graph): The computing graph to be optimized.
|
|
|
|
cnode (List[str], optional): Common node List, should be the subset of input. Defaults to None.
|
|
|
|
num_grids (int, optional): Number of grids to search for b. Defaults to 6.
|
|
|
|
"""
|
|
|
|
super().__init__(graph, 0, 0, True, cnode)
|
|
|
|
self.num_grids = num_grids
|
|
|
|
|
|
|
|
def solve(self) -> Graph:
|
|
|
|
"""Solve the checkpointing problem using Algorithm 3.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
graph (Graph): The optimized graph, should be a copy of the original graph.
|
|
|
|
"""
|
|
|
|
checkpointable_op = ['call_module', 'call_method', 'call_function', 'get_attr']
|
|
|
|
ckpt = self.grid_search()
|
|
|
|
for i, seg in enumerate(ckpt):
|
|
|
|
for idx in range(*seg):
|
|
|
|
nodes = self.node_list[idx]
|
|
|
|
for n in nodes:
|
|
|
|
if n.op in checkpointable_op:
|
|
|
|
n.meta['activation_checkpoint'] = i
|
|
|
|
return deepcopy(self.graph)
|
|
|
|
|
|
|
|
def run_chen_greedy(self, b: int = 0) -> Tuple[Set, int]:
|
|
|
|
"""
|
|
|
|
This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
|
|
|
|
"""
|
|
|
|
ckpt_intv = []
|
|
|
|
temp = 0
|
|
|
|
x = 0
|
|
|
|
y = 0
|
|
|
|
prev_idx = 2
|
|
|
|
for idx, nodes in enumerate(self.node_list):
|
|
|
|
for n in nodes:
|
|
|
|
n: Node
|
|
|
|
temp += calculate_fwd_in(n) + calculate_fwd_tmp(n)
|
|
|
|
y = max(y, temp)
|
|
|
|
if temp > b and idx > prev_idx:
|
|
|
|
x += calculate_fwd_in(nodes[0])
|
|
|
|
temp = 0
|
|
|
|
ckpt_intv.append((prev_idx, idx + 1))
|
|
|
|
prev_idx = idx + 1
|
|
|
|
return ckpt_intv, math.floor(math.sqrt(x * y))
|
|
|
|
|
|
|
|
def grid_search(self) -> Set:
|
|
|
|
"""
|
|
|
|
Search ckpt strategy with b = 0, then run the allocation algorithm again with b = √xy.
|
|
|
|
Grid search over [√2/2 b, √2 b] for ``ckpt_opt`` over ``num_grids`` as in appendix A.
|
|
|
|
"""
|
|
|
|
_, b_approx = self.run_chen_greedy(0)
|
|
|
|
b_min, b_max = math.floor(b_approx / math.sqrt(2)), math.ceil(b_approx * math.sqrt(2))
|
|
|
|
b_opt = math.inf
|
|
|
|
for b in range(b_min, b_max, (b_max - b_min) // self.num_grids):
|
|
|
|
ckpt_intv, b_approx = self.run_chen_greedy(b)
|
|
|
|
if b_approx < b_opt:
|
|
|
|
b_opt = b_approx
|
|
|
|
ckpt_opt = ckpt_intv
|
|
|
|
return ckpt_opt
|