mirror of https://github.com/hpcaitech/ColossalAI
parent
99d9713b02
commit
0f02b8c6e6
|
@ -9,6 +9,40 @@ def pipe_split():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    """
    Split ``gm`` into ``pp_size`` pipeline stages balanced by forward FLOPs.

    The MetaInfoProp interpreter must have run first so every node carries
    meta info; if the first node has no ``tensor_meta`` entry, this pass
    falls back to the plain ``balanced_split_pass``.
    """
    graph = gm.graph

    # No meta info available — fall back to the normal balanced split pass.
    first_node = list(graph.nodes)[0]
    if 'tensor_meta' not in first_node.meta:
        return balanced_split_pass(gm, pp_size)

    # Forward FLOPs remaining to be distributed over the stages not yet cut.
    remaining_flop = sum(node.fwd_flop for node in graph.nodes)

    stage_budget = remaining_flop // pp_size
    stage_flop = 0
    for node in graph.nodes:
        if pp_size <= 1:
            # Everything left belongs to the final stage — no more cuts.
            break
        if 'pipe_split' in node.name:
            # Existing split markers carry no compute; never count them.
            continue
        stage_flop += node.fwd_flop
        if stage_flop >= stage_budget:
            # Close the current stage, then rebalance the per-stage budget
            # over the stages still to be carved out.
            remaining_flop -= stage_flop
            stage_flop = 0
            pp_size -= 1
            stage_budget = remaining_flop // pp_size
            with graph.inserting_after(node):
                graph.create_node('call_function', pipe_split)

    gm.recompile()
    return gm
|
||||||
|
|
||||||
|
|
||||||
def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
|
def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
|
||||||
"""
|
"""
|
||||||
In avgnode_split_pass, simply split graph by node number.
|
In avgnode_split_pass, simply split graph by node number.
|
||||||
|
@ -104,8 +138,10 @@ def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
|
||||||
continue
|
continue
|
||||||
accumulate_node_size += node.node_size
|
accumulate_node_size += node.node_size
|
||||||
if accumulate_node_size >= partition_size:
|
if accumulate_node_size >= partition_size:
|
||||||
|
total_element_size = total_element_size - accumulate_node_size
|
||||||
accumulate_node_size = 0
|
accumulate_node_size = 0
|
||||||
pp_size -= 1
|
pp_size -= 1
|
||||||
|
partition_size = total_element_size // pp_size
|
||||||
with mod_graph.inserting_after(node):
|
with mod_graph.inserting_after(node):
|
||||||
split_node = mod_graph.create_node('call_function', pipe_split)
|
split_node = mod_graph.create_node('call_function', pipe_split)
|
||||||
gm.recompile()
|
gm.recompile()
|
||||||
|
|
|
@ -112,7 +112,8 @@ class MetaInfoProp(torch.fx.Interpreter):
|
||||||
n.meta['tensor_meta'] = tensor_meta
|
n.meta['tensor_meta'] = tensor_meta
|
||||||
n.meta = {**n.meta, **asdict(meta_info)} # extend MetaInfo to `n.meta`
|
n.meta = {**n.meta, **asdict(meta_info)} # extend MetaInfo to `n.meta`
|
||||||
# TODO: the attribute node_size should be removed in the future
|
# TODO: the attribute node_size should be removed in the future
|
||||||
setattr(n, 'node_size', activation_size(n.meta.get('fwd_in', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
|
setattr(n, 'node_size', activation_size(n.meta.get('fwd_out', 0)) + activation_size(n.meta.get('fwd_tmp', 0)))
|
||||||
|
setattr(n, 'fwd_flop', n.meta.get('fwd_flop', 0))
|
||||||
n.meta['type'] = type(result)
|
n.meta['type'] = type(result)
|
||||||
|
|
||||||
# retain the autograd graph
|
# retain the autograd graph
|
||||||
|
|
Loading…
Reference in New Issue