ColossalAI/colossalai/auto_parallel/offload/util.py

from dataclasses import dataclass
from typing import List

import torch

from colossalai.context.singleton_meta import SingletonMeta
from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp

from .region import Region


@dataclass
class NodeInfo:
    """
    Plain record holding an id and two memory figures for a single node.

    Every field defaults to 0. How the values are filled in, and which unit the
    memory figures use, is not visible in this module.
    """

    # Integer identifier of the node (presumably its position in the traced graph — confirm).
    node_id: int = 0
    # Memory figure tied to the node's forward pass (unit not shown here).
    runtime_fwd_mem: float = 0
    # Memory figure tied to the node's backward pass (unit not shown here).
    runtime_bwd_mem: float = 0


class NvDevicePower:
    """
    NVIDIA GPU computing performance (TFLOPs).

    Constants are named ``<DEVICE>_<PRECISION>``, with one FP16 and one FP32
    entry per device.

    NOTE(review): the figures are hard-coded and their source is not recorded
    in this file — verify against vendor specifications before relying on them.
    """

    # GeForce RTX 3080
    RTX3080_FP16 = 70
    RTX3080_FP32 = 34.1

    # GeForce RTX 3090
    RTX3090_FP16 = 71
    RTX3090_FP32 = 35.7

    # Tesla V100
    V100_FP16 = 31.4
    V100_FP32 = 15.7

    # A100
    A100_FP16 = 78
    A100_FP32 = 19.5


class GlobalRuntimeInfo(metaclass=SingletonMeta):
    """
    Shared runtime state: two CUDA streams, two prefetch-event maps and a region list.

    The class is built through the ``SingletonMeta`` metaclass; presumably every
    ``GlobalRuntimeInfo()`` call therefore yields the same instance — confirm in
    ``colossalai.context.singleton_meta``.
    """

    def __init__(self):
        # Two independent CUDA streams. The names suggest host-to-device and
        # device-to-host transfers respectively; their actual use is outside this module.
        # NOTE(review): creating a torch.cuda.Stream presumably needs a working CUDA
        # runtime at construction time — confirm for CPU-only environments.
        self.h2d_stream = torch.cuda.Stream()
        self.d2h_stream = torch.cuda.Stream()
        # Event look-up tables for forward / backward prefetching; they start empty and
        # their key/value schema is defined by the code that fills them (not shown here).
        self.fwd_prefetch_event_map = {}
        self.bwd_prefetch_event_map = {}
        # Starts empty; populated elsewhere (presumably with Region objects — confirm).
        self.region_list = []


def compute_act_peak_mem(region_list: List[Region]) -> float:
    """
    Return the highest value reached by a running memory total during a
    simulated forward sweep followed by a backward sweep over all nodes.

    Forward: nodes are visited region by region in the given order, and each
    node adds ``calculate_fwd_tmp(node) + calculate_fwd_out(node)`` to the
    running total.

    Backward: regions and their nodes are visited in reverse order. For each
    node the forward output is removed, ``bwd_mem_tmp`` and ``bwd_mem_out``
    (read from ``node.meta``) are added, the peak is sampled, and then
    ``bwd_mem_tmp`` and the forward temporary are removed again. A node's
    ``bwd_mem_out`` is removed later, once the dependency counter kept for it
    in ``bwd_deps`` drops to zero.

    Args:
        region_list: sequence of regions. Each region exposes ``nodes``, a
            sequence of graph nodes providing ``meta``, ``users`` and
            ``all_input_nodes``. Any reversible sequence is accepted (lists,
            tuples, ...).

    Returns:
        The maximum running total observed; ``0`` when there are no nodes.
    """
    act_peak_mem = 0
    runtime_mem = 0

    # forward
    for region in region_list:
        for node in region.nodes:
            runtime_mem = runtime_mem + calculate_fwd_tmp(node) + calculate_fwd_out(node)
            act_peak_mem = max(runtime_mem, act_peak_mem)

    # backward
    bwd_deps = {}
    # Use the built-in reversed() instead of calling __reversed__ directly: it
    # also handles sequences that only implement __len__/__getitem__ (e.g. tuples).
    for region in reversed(region_list):
        for node in reversed(region.nodes):
            runtime_mem -= calculate_fwd_out(node)
            runtime_mem = runtime_mem + node.meta["bwd_mem_tmp"] + node.meta["bwd_mem_out"]

            # The peak is sampled while both backward buffers are still counted.
            act_peak_mem = max(runtime_mem, act_peak_mem)

            runtime_mem = runtime_mem - node.meta["bwd_mem_tmp"] - calculate_fwd_tmp(node)

            # free bwd_mem_out
            # Register this node with a counter equal to its number of input nodes.
            # Then, for each of its users that was already registered during this
            # reverse walk, decrement that user's counter; when it reaches zero the
            # user's bwd_mem_out is dropped from the running total.
            bwd_deps[node] = len(node.all_input_nodes)
            for user_node in node.users:
                if user_node in bwd_deps:
                    bwd_deps[user_node] -= 1
                    if bwd_deps[user_node] <= 0:
                        runtime_mem -= user_node.meta["bwd_mem_out"]

    return act_peak_mem


def compute_max_param_mem(region_list: List[Region]) -> float:
    """
    Return the largest ``param_size`` found among the given regions.

    Args:
        region_list: regions to inspect; each must expose ``param_size``.

    Returns:
        The maximum ``param_size``, or ``0`` when ``region_list`` is empty
        (instead of letting ``max`` raise ``ValueError`` on an empty iterable).
    """
    return max((region.param_size for region in region_list), default=0)


def compute_total_param_mem(region_list: List[Region]) -> float:
    """
    Add up ``param_size`` over the regions whose ``r_id`` does not exceed
    their ``shared_rid``.

    Regions with ``r_id > shared_rid`` are left out of the sum.

    Args:
        region_list: regions to inspect; each must expose ``param_size``,
            ``r_id`` and ``shared_rid``.

    Returns:
        The accumulated ``param_size`` (``0`` when no region qualifies).
    """
    total = 0
    for reg in region_list:
        if reg.r_id <= reg.shared_rid:
            total = total + reg.param_size
    return total


def requires_upload_p_in_fwd(shared_reg: Region):
    """
    Decide from a region's ids and offload flag whether the forward-pass
    parameter upload applies to it.

    The result is truthy when ``r_id >= shared_rid``; otherwise it is whatever
    ``need_offload`` holds.

    Args:
        shared_reg: region exposing ``r_id``, ``shared_rid`` and ``need_offload``.
    """
    not_before_shared = shared_reg.r_id >= shared_reg.shared_rid
    if not_before_shared:
        return not_before_shared
    # r_id precedes shared_rid: the decision rests on the offload flag alone.
    return shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload


def requires_release_p_in_bwd(shared_reg: Region):
    """
    Decide from a region's ids and offload flag whether the backward-pass
    parameter release applies to it.

    The result is truthy when ``r_id >= shared_rid``; otherwise it is whatever
    ``need_offload`` holds.

    Args:
        shared_reg: region exposing ``r_id``, ``shared_rid`` and ``need_offload``.
    """
    not_before_shared = shared_reg.r_id >= shared_reg.shared_rid
    if not_before_shared:
        return not_before_shared
    # r_id precedes shared_rid: the decision rests on the offload flag alone.
    return shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload


def requires_offload_g_in_bwd(region: Region):
    """
    Decide from a region's parameter size and ids whether the backward-pass
    gradient offload applies to it.

    When ``param_size`` is falsy (e.g. ``0``) that value itself is returned;
    otherwise the result is the boolean ``r_id <= shared_rid``.

    Args:
        region: region exposing ``param_size``, ``r_id`` and ``shared_rid``.
    """
    size = region.param_size
    if not size:
        # Hand back the falsy size unchanged rather than coercing it to bool.
        return size
    return region.r_id <= region.shared_rid
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`from dataclasses import dataclass`
			`from typing import List`
[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`import torch`
[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
			`from colossalai.context.singleton_meta import SingletonMeta`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`from colossalai.fx.profiler import calculate_fwd_out, calculate_fwd_tmp`

			`from .region import Region`


			`@dataclass`
			`class NodeInfo:`
			`node_id: int = 0`
			`runtime_fwd_mem: float = 0`
			`runtime_bwd_mem: float = 0`

[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`class NvDevicePower:`
			`"""`
			`NVIDIA GPU computing performance (TFLOPs).`
			`"""`

			`RTX3080_FP16 = 70`
			`RTX3080_FP32 = 34.1`

			`RTX3090_FP16 = 71`
			`RTX3090_FP32 = 35.7`

			`V100_FP16 = 31.4`
			`V100_FP32 = 15.7`

			`A100_FP16 = 78`
			`A100_FP32 = 19.5`


[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago			`class GlobalRuntimeInfo(metaclass=SingletonMeta):`
			`def __init__(self):`
			`self.h2d_stream = torch.cuda.Stream()`
			`self.d2h_stream = torch.cuda.Stream()`
			`self.fwd_prefetch_event_map = {}`
			`self.bwd_prefetch_event_map = {}`
			`self.region_list = []`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago

			`def compute_act_peak_mem(region_list: List[Region]) -> float:`
			`act_peak_mem = 0`
			`runtime_mem = 0`
			`# forward`
			`for region in region_list:`
			`for node in region.nodes:`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`runtime_mem = runtime_mem + calculate_fwd_tmp(node) + calculate_fwd_out(node)`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`act_peak_mem = max(runtime_mem, act_peak_mem)`
			`# backward`
			`bwd_deps = {}`
			`for region in region_list.__reversed__():`
			`for node in region.nodes.__reversed__():`
			`runtime_mem -= calculate_fwd_out(node)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`runtime_mem = runtime_mem + node.meta["bwd_mem_tmp"] + node.meta["bwd_mem_out"]`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`act_peak_mem = max(runtime_mem, act_peak_mem)`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`runtime_mem = runtime_mem - node.meta["bwd_mem_tmp"] - calculate_fwd_tmp(node)`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`# free bwd_mem_out`
			`bwd_deps[node] = len(node.all_input_nodes)`
			`for user_node in node.users:`
			`if user_node in bwd_deps:`
			`bwd_deps[user_node] -= 1`
			`if bwd_deps[user_node] <= 0:`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`runtime_mem -= user_node.meta["bwd_mem_out"]`
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`return act_peak_mem`

[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`def compute_max_param_mem(region_list: List[Region]) -> float:`
			`return max(region.param_size for region in region_list)`

[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`def compute_total_param_mem(region_list: List[Region]) -> float:`
			`return sum(region.param_size for region in region_list if region.r_id <= region.shared_rid)`

[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago			`def requires_upload_p_in_fwd(shared_reg: Region):`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`return (shared_reg.r_id >= shared_reg.shared_rid) or (`
			`shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload`
			`)`
[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`def requires_release_p_in_bwd(shared_reg: Region):`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`return (shared_reg.r_id >= shared_reg.shared_rid) or (`
			`shared_reg.r_id < shared_reg.shared_rid and shared_reg.need_offload`
			`)`
[test] fixed gemini plugin test (#3411) * [test] fixed gemini plugin test * polish code * polish code 2 years ago
[auto-parallel] add auto-offload feature (#3154) * add auto-offload feature * polish code * fix syn offload runtime pass bug * add offload example * fix offload testing bug * fix example testing bug 2 years ago
			`def requires_offload_g_in_bwd(region: Region):`
			`return region.param_size and (region.r_id <= region.shared_rid)`