import numpy as np
import torch
import tqdm
from torch.fx import symbolic_trace
from torch.fx.node import Node

from colossalai.fx.passes.split_module import split_module


def pipe_split():
    pass


def block_split():
    pass


# Construct blocks with the condition that (block_flops / total_flops) >= limit.
def construct_blocks(gm: torch.fx.GraphModule, limit=0.01):
    total_fwd_flop = 0
    total_bwd_flop = 0
    for node in gm.graph.nodes:
        total_fwd_flop += node.fwd_flop
        total_bwd_flop += node.bwd_flop

    total_flop = total_fwd_flop + total_bwd_flop
    per_block_flop = total_flop * limit
    accumulate_fwd_flop = 0
    accumulate_bwd_flop = 0
    block_nodes = []
    for node in gm.graph.nodes:
        if 'block_split' in node.name:
            continue
        accumulate_fwd_flop += node.fwd_flop
        accumulate_bwd_flop += node.bwd_flop
        if accumulate_fwd_flop + accumulate_bwd_flop >= per_block_flop:
            # Close the current block: the marker node records the flops accumulated so far.
            with gm.graph.inserting_after(node):
                block_node = gm.graph.create_node('call_function', block_split)
                setattr(block_node, 'fwd_flop', accumulate_fwd_flop)
                setattr(block_node, 'bwd_flop', accumulate_bwd_flop)
            accumulate_fwd_flop = 0
            accumulate_bwd_flop = 0
            block_nodes.append(block_node)

    return block_nodes


def remove_blocks(gm: torch.fx.GraphModule):
    # Erase the temporary block_split markers inserted by construct_blocks.
    for node in gm.graph.nodes:
        if (node.op, node.target) == ('call_function', block_split):
            gm.graph.erase_node(node)
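

# e.g. an illustrative sketch (not part of the pass API): build blocks on a toy traced module.
# The flop numbers attached below are hypothetical stand-ins; in real use the fwd_flop / bwd_flop
# attributes are produced by ColossalAI's flop profiling, not set by hand.
def _example_construct_blocks():
    import torch.nn as nn

    gm = symbolic_trace(nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8)))
    for node in gm.graph.nodes:
        # Hypothetical per-node costs: only call_module nodes are given non-zero flops.
        node.fwd_flop = 1e6 if node.op == 'call_module' else 0
        node.bwd_flop = 2 * node.fwd_flop
    # With limit=0.3, a block is closed once it has accumulated >= 30% of the total flops.
    block_nodes = construct_blocks(gm, limit=0.3)
    per_block_flops = [(blk.fwd_flop, blk.bwd_flop) for blk in block_nodes]
    # The markers are only annotations and can be erased once the block info has been read.
    remove_blocks(gm)
    gm.recompile()
    return per_block_flops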


def get_compute_costs(node_list):
    num_nodes = len(node_list)
    # all_compute_cost[start, end] is the total fwd + bwd flop of nodes start..end (inclusive).
    all_compute_cost = np.full((num_nodes, num_nodes), np.inf, dtype=np.float64)

    for start in tqdm.tqdm(range(num_nodes), desc='start pos', position=0):
        for end in tqdm.tqdm(range(start, num_nodes), desc='end pos', position=1, leave=False):
            selected_flops = [(node_list[i].fwd_flop + node_list[i].bwd_flop) for i in range(start, end + 1)]
            all_compute_cost[start, end] = sum(selected_flops)

    return all_compute_cost


def do_dp_split_gpipe_impl(num_nodes, num_stages, num_microbatches, compute_costs, max_compute_cost):
    """The core implementation of the DP algorithm."""
    # Adapted from the Alpa DP formulation.
    # For f, node IDs start from 0.
    # f[number of stages,
    #   node id that is currently being considered]

    # Record the time cost (currently assessed by fwd + bwd flop).
    f = np.full((num_stages + 1, num_nodes + 1), np.inf, dtype=np.float32)

    # Record the max stage compute cost among all stages in this partition.
    f_stage_max = np.full((num_stages + 1, num_nodes + 1), 0.0, dtype=np.float32)
    # Record the start node index for the next stage in this partition.
    f_argmin = np.full((num_stages + 1, num_nodes + 1), -1, dtype=np.int32)
    f[0, num_nodes] = 0
    for s in tqdm.tqdm(range(1, num_stages + 1), desc='stage', position=2, leave=False):    # pylint: disable=too-many-nested-blocks
        for i in tqdm.tqdm(range(num_nodes - 1, -1, -1), desc='start node', position=3, leave=False):
            for k in tqdm.tqdm(range(num_nodes, i, -1), desc='mid node', position=4, leave=False):
                stage_cost = compute_costs[i, k - 1]
                new_cost = f[s - 1, k] + stage_cost
                if (stage_cost <= max_compute_cost and new_cost < f[s, i]):
                    f[s, i] = new_cost
                    f_stage_max[s, i] = max(f_stage_max[s - 1, k], stage_cost)
                    f_argmin[s, i] = k

    best_total_cost = f[num_stages, 0]
    if np.isinf(best_total_cost):
        return np.inf, None

    total_cost = f[num_stages, 0] + (num_microbatches - 1) * f_stage_max[num_stages, 0]

    current_s = num_stages
    current_node = 0

    res = []
    while current_s > 0 and current_node < num_nodes:
        next_start_node = f_argmin[current_s, current_node]
        res.append((current_node, next_start_node))
        current_s -= 1
        current_node = next_start_node

    return total_cost, res


def do_dp_split_gpipe(node_list, compute_costs, num_stages: int, num_microbatches: int):
    # Ignore the memory cost profiling in Alpa's design for convenience.
    max_compute_costs = np.sort(np.unique(compute_costs))
    best_cost = np.inf
    best_solution = None
    last_max_compute_cost = 0.0
    gap = 1e6    # temporary magic number, unit: flops

    for max_compute_cost in tqdm.tqdm(max_compute_costs):
        # Pruning to reduce search space.
        if max_compute_cost * num_microbatches >= best_cost:
            break
        if max_compute_cost - last_max_compute_cost < gap:
            continue

        cost, solution = do_dp_split_gpipe_impl(len(node_list), num_stages, num_microbatches, compute_costs,
                                                max_compute_cost)

        if cost < best_cost:
            best_cost = cost
            best_solution = solution
        last_max_compute_cost = max_compute_cost
    return best_cost, best_solution
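

# e.g. an illustrative sketch: run the DP on synthetic per-node costs to see how contiguous
# nodes are grouped into stages. `_FakeCostNode` is a hypothetical helper that only mimics the
# fwd_flop / bwd_flop attributes the DP reads from fx nodes.
def _example_dp_on_synthetic_costs(num_stages=2, num_microbatches=4):

    class _FakeCostNode:

        def __init__(self, fwd_flop, bwd_flop):
            self.fwd_flop = fwd_flop
            self.bwd_flop = bwd_flop

    node_list = [_FakeCostNode(flop, 2 * flop) for flop in (1e6, 2e6, 3e6, 2e6)]
    compute_costs = get_compute_costs(node_list)
    # best_solution is a list of (stage_start_node, next_stage_start_node) index pairs.
    best_cost, best_solution = do_dp_split_gpipe(node_list, compute_costs, num_stages, num_microbatches)
    return best_cost, best_solution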


# Auto DP partition based on Alpa.
# Adapted to the GPipe scheduler.
# mode:
#   'node': a partition unit is a single fx node
#   'block': a partition unit is a block made up of several fx nodes
def gpipe_dp_split_pass(gm: torch.fx.GraphModule, pp_size: int, num_microbatches: int, mode='block', block_limit=0.01):
    assert mode in ['node', 'block']

    # Nodes or blocks that will be used as partition units.
    node_list = []
    if mode == 'node':
        for node in gm.graph.nodes:
            node_list.append(node)
    elif mode == 'block':
        node_list = construct_blocks(gm, limit=block_limit)
    else:
        pass

    compute_costs = get_compute_costs(node_list)

    best_cost, best_solution = do_dp_split_gpipe(node_list, compute_costs, pp_size, num_microbatches)

    for (_, next_start_node) in best_solution:
        if pp_size <= 1:
            break
        node = node_list[next_start_node]
        with gm.graph.inserting_before(node):
            split_node = gm.graph.create_node('call_function', pipe_split)
        pp_size -= 1

    # Remove the temporary block nodes if needed.
    if mode == 'block':
        remove_blocks(gm)

    gm.recompile()
    return gm
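

# e.g. an illustrative sketch: annotate a traced module with synthetic flop counts and let the
# DP pass place the pipe_split markers. The hand-written fwd_flop / bwd_flop values below are
# assumptions for demonstration; in real use they come from ColossalAI's flop profiling pass
# rather than a hand-rolled loop.
def _example_gpipe_dp_split(pp_size=2, num_microbatches=4):
    import torch.nn as nn

    gm = symbolic_trace(nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16), nn.ReLU()))
    for node in gm.graph.nodes:
        node.fwd_flop = 1e6 if node.op == 'call_module' else 0
        node.bwd_flop = 2 * node.fwd_flop
    # mode='node' treats every fx node as a partition unit; mode='block' would first group
    # nodes into blocks via construct_blocks.
    annotated_gm = gpipe_dp_split_pass(gm, pp_size, num_microbatches, mode='node')
    return annotated_gm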


def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    In avgcompute_split_pass, we split the module by forward flops.
    """
    mod_graph = gm.graph
    # To use avgcompute_split_pass, we need to run the meta_info_prop interpreter first.
    # If nodes don't have meta info, this pass falls back to the normal balanced split pass.
    check_node = list(mod_graph.nodes)[0]
    if 'tensor_meta' not in check_node.meta:
        return balanced_split_pass(gm, pp_size)

    total_fwd_flop = 0
    for node in mod_graph.nodes:
        total_fwd_flop += node.fwd_flop

    partition_flop = total_fwd_flop // pp_size
    accumulate_fwd_flop = 0
    for node in mod_graph.nodes:
        if pp_size <= 1:
            break
        if 'pipe_split' in node.name:
            continue
        accumulate_fwd_flop += node.fwd_flop
        if accumulate_fwd_flop >= partition_flop:
            # Rebalance the remaining flops across the remaining partitions.
            total_fwd_flop = total_fwd_flop - accumulate_fwd_flop
            accumulate_fwd_flop = 0
            pp_size -= 1
            partition_flop = total_fwd_flop // pp_size
            with mod_graph.inserting_after(node):
                split_node = mod_graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm


def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    In avgnode_split_pass, we simply split the graph by node count.
    """
    mod_graph = gm.graph
    avg_num_node = len(mod_graph.nodes) // pp_size
    accumulate_num_node = 0
    for node in mod_graph.nodes:
        if pp_size <= 1:
            break
        accumulate_num_node += 1
        if accumulate_num_node >= avg_num_node:
            accumulate_num_node = 0
            pp_size -= 1
            # If the next node is the output node, insert the split before the current node
            # so that the last partition is not empty.
            if node.next.op == 'output':
                with mod_graph.inserting_before(node):
                    split_node = mod_graph.create_node('call_function', pipe_split)
            else:
                with mod_graph.inserting_after(node):
                    split_node = mod_graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm
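

# e.g. an illustrative sketch: avgnode_split_pass needs no profiling metadata, so it can be
# applied directly to any traced module; the toy model below is an assumption for
# demonstration only.
def _example_avgnode_split(pp_size=2):
    import torch.nn as nn

    gm = symbolic_trace(nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8), nn.ReLU()))
    gm = avgnode_split_pass(gm, pp_size)
    # The inserted markers can be located by name and later consumed by
    # split_with_split_nodes_pass to materialize the partitions.
    split_names = [node.name for node in gm.graph.nodes if 'pipe_split' in node.name]
    return gm, split_names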


def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    In balanced_split_pass, we split the module by the size of parameters (weights + bias).
    """
    mod_graph = gm.graph
    total_param_amount = 0
    for param in mod_graph.owning_module.parameters():
        total_param_amount += param.numel()
    params_per_partition = total_param_amount // pp_size
    accumulate_param_amount = 0
    for node in mod_graph.nodes:
        if pp_size <= 1:
            break
        if node.op == "call_module":
            target_module = node.graph.owning_module.get_submodule(node.target)
            for param in target_module.parameters():
                accumulate_param_amount += param.numel()
        if accumulate_param_amount >= params_per_partition:
            accumulate_param_amount = 0
            pp_size -= 1
            # If the next node is the output node, we insert the split annotation before the
            # current node to make sure there is at least one node in the last partition.
            if node.next.op == 'output':
                with mod_graph.inserting_before(node):
                    split_node = mod_graph.create_node('call_function', pipe_split)
            else:
                with mod_graph.inserting_after(node):
                    split_node = mod_graph.create_node('call_function', pipe_split)
    # If fewer than pp_size partitions were produced, insert extra splits (before every second
    # non-placeholder node) until the target partition count is reached.
    if pp_size > 1:
        node_counter = 0
        for node in mod_graph.nodes:
            if pp_size <= 1:
                break
            if node.op == 'placeholder':
                continue
            elif node_counter == 0:
                node_counter += 1
            else:
                pp_size -= 1
                node_counter = 0
                with mod_graph.inserting_before(node):
                    split_node = mod_graph.create_node('call_function', pipe_split)

    gm.recompile()
    return gm


def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
    """
    In balanced_split_pass_v2, we split the module by node size (weights + bias + outputs).
    """
    mod_graph = gm.graph
    # To use balanced_split_pass_v2, we need to run the meta_info_prop interpreter first.
    # If nodes don't have meta info, this pass falls back to the normal balanced split pass.
    check_node = list(mod_graph.nodes)[0]
    if 'tensor_meta' not in check_node.meta:
        return balanced_split_pass(gm, pp_size)

    total_element_size = 0
    for node in mod_graph.nodes:
        total_element_size += node.node_size

    partition_size = total_element_size // pp_size
    accumulate_node_size = 0
    for node in mod_graph.nodes:
        if pp_size <= 1:
            break
        if 'pipe_split' in node.name:
            continue
        accumulate_node_size += node.node_size
        if accumulate_node_size >= partition_size:
            # Rebalance the remaining size across the remaining partitions.
            total_element_size = total_element_size - accumulate_node_size
            accumulate_node_size = 0
            pp_size -= 1
            partition_size = total_element_size // pp_size
            with mod_graph.inserting_after(node):
                split_node = mod_graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm


def uniform_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    In uniform_split_pass, we split the module so that each partition contains roughly the same
    number of direct child modules (layers).
    """
    mod_graph = gm.graph
    valid_children_size = 0
    valid_children = []
    for module in mod_graph.owning_module.children():
        valid_children_size += 1
        valid_children.append(module)

    if valid_children_size < pp_size:
        # If there are not enough valid children to shard, fall back to the balanced policy
        # instead of the uniform policy.
        return balanced_split_pass(gm, pp_size)
    layers_per_partition = valid_children_size // pp_size
    accumulate_layer_amount = 0
    for node in mod_graph.nodes:
        if pp_size <= 1:
            break
        if node.op == "call_module":
            target_module = node.graph.owning_module.get_submodule(node.target)
            if target_module in valid_children:
                accumulate_layer_amount += 1
        if accumulate_layer_amount == layers_per_partition:
            accumulate_layer_amount = 0
            pp_size -= 1
            with mod_graph.inserting_after(node):
                split_node = mod_graph.create_node('call_function', pipe_split)
    gm.recompile()
    return gm


def split_with_split_nodes_pass(annotated_gm: torch.fx.GraphModule, merge_output=False):
    # TODO(lyl): use partition IR to assign partition ID to each node.
    # Currently: analyzing graph -> annotate graph by inserting split node -> use split module pass to split graph
    # In future: graph to partitions -> analyzing partition IR -> recombining partitions to get best performance -> assign partition ID to each node
    part_idx = 0

    def split_callback(n: torch.fx.Node):
        nonlocal part_idx
        if (n.op, n.target) == ('call_function', pipe_split):
            part_idx += 1
        return part_idx

    split_mod = split_module(annotated_gm, None, split_callback, merge_output)
    split_submodules = []
    for name, submodule in split_mod.named_modules():
        if isinstance(submodule, torch.fx.GraphModule):
            for node in submodule.graph.nodes:
                if (node.op, node.target) == ('call_function', pipe_split):
                    submodule.graph.erase_node(node)
            submodule.recompile()
            split_submodules.append(submodule)

    return split_mod, split_submodules
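

# e.g. an illustrative end-to-end sketch: annotate a traced module with one of the split passes
# above, then materialize the partitions. The toy model and shapes are assumptions for
# demonstration only.
def _example_split_end_to_end(pp_size=2):
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 32), nn.ReLU())
    annotated_gm = balanced_split_pass(symbolic_trace(model), pp_size)
    split_mod, split_submodules = split_with_split_nodes_pass(annotated_gm)
    # split_mod is the rewired top-level GraphModule (e.g. split_mod(torch.rand(4, 32)) still
    # runs the whole model); each entry of split_submodules is a GraphModule with the
    # pipe_split markers erased.
    return split_mod, split_submodules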