# ColossalAI/colossalai/auto_parallel/offload/solver.py

import time
from abc import ABC, abstractmethod
from typing import Dict, List, Type

# pynvml is an optional dependency: NOT_NVML records whether its import failed,
# in which case none of the nvml* names are available in this module.
NOT_NVML = False
try:
    from pynvml import *
except ImportError:
    # Only a failed import is tolerated here; anything else
    # (e.g. KeyboardInterrupt) must keep propagating.
    NOT_NVML = True

import torch
from torch.fx.node import Node

from colossalai.accelerator import get_accelerator

from .region import Region
from .training_simulator import AsynTrainingSimulator, SynTrainingSimulator, TrainingSimulator
from .util import NodeInfo, NvDevicePower


def benchmark_func(func, number=1, repeat=1, warmup=3):
    """
    Benchmark the average wall-clock cost of one ``func()`` call.

    ``func`` is first called ``warmup`` times without being timed. Then ``repeat``
    measurements are taken, each one timing ``number`` back-to-back calls, and the
    mean per-call cost over all measurements is returned.

    Args:
        func (Callable[[], Any]): the zero-argument callable to benchmark.
        number (int): calls per measurement, must be positive.
        repeat (int): number of measurements, must be positive.
        warmup (int): number of untimed warm-up calls.

    Returns:
        float: average cost of a single call, in seconds.

    Raises:
        ValueError: if ``number`` or ``repeat`` is not positive.
    """

    if number <= 0 or repeat <= 0:
        raise ValueError(f"number and repeat must be positive, got number={number}, repeat={repeat}")

    # The synchronization only makes sense (and only works) when a CUDA device
    # is present; skipping it otherwise keeps the helper usable on CPU-only hosts.
    cuda_available = torch.cuda.is_available()

    def _synchronize():
        if cuda_available:
            torch.cuda.synchronize()

    for _ in range(warmup):
        func()

    costs = []

    for _ in range(repeat):
        # wait for pending device work so that it is not attributed to this measurement
        _synchronize()
        # perf_counter is monotonic and high resolution, unlike time.time()
        begin = time.perf_counter()
        for _ in range(number):
            func()
        # wait for the work launched by func() to actually finish before stopping the clock
        _synchronize()
        costs.append((time.perf_counter() - begin) / number)

    return sum(costs) / len(costs)


class Solver(ABC):
    """
    The parameter offload solver.

    Constructing a solver profiles the CPU<->GPU bandwidth and looks up the
    computing power of the GPU, so it requires a CUDA device and pynvml.

    Args:
        region_list (List[Region]): represents the linearized DNN computing graph.
        memory_budget (float): the given memory budget.
            A non-positive value selects the total memory of the current device instead.
        error_factor (float): the error factor.
            It is used to reduce the memory budget, to compensate for errors
            in the estimation of peak memory and execution time.
    """

    def __init__(self, region_list: List[Region], memory_budget: float = -1.0, error_factor: float = 0.95) -> None:
        self.region_list = region_list

        self.error_factor: float = error_factor
        if memory_budget > 0:
            self.memory_budget = memory_budget * self.error_factor
        else:
            # no explicit budget: fall back to the whole memory of the current device
            self.memory_budget = (
                torch.cuda.get_device_properties(get_accelerator().get_current_device()).total_memory
                * self.error_factor
            )

        # {"h2d" | "d2h": {transfer size in bytes: measured bandwidth in bytes per second}}
        self.link_to_bandwidth: Dict[str, Dict[float, float]] = self._profile_bandwidth()
        self.comp_power: float = self._extract_computing_power()

    @abstractmethod
    def _call_solver(self):
        """Search an offloading strategy; implemented by the concrete solvers."""
        raise NotImplementedError

    @abstractmethod
    def _try_to_offload(self, *args):
        """Tentatively apply one offload choice and evaluate it; implemented by the concrete solvers."""
        raise NotImplementedError

    @abstractmethod
    def _eval_one_choice(self, *args):
        """Simulate the current strategy and compute its profit; implemented by the concrete solvers."""
        raise NotImplementedError

    def _compute_offload_profit(self, total_mem_saving: float, peak_mem_saving: float, extra_cost: float):
        """
        Compute the profits of the offload strategies,
        which packages the memory savings information for subsequent comparisons.

        Args:
            total_mem_saving (float): the total memory saving of the offload strategy.
            peak_mem_saving (float): the peak memory saving of the offload strategy.
            extra_cost (float): extra data transfer cost.

        Returns:
            tuple: profit information, the first term represents memory savings per unit of time
                (``inf`` when there is no extra cost), followed by the two savings unchanged.
        """

        if extra_cost == 0:
            # means data transfer overhead can be completely overlapped
            return (float("inf"), total_mem_saving, peak_mem_saving)
        return (total_mem_saving / extra_cost, total_mem_saving, peak_mem_saving)

    def _compare_profit(self, profit_a: tuple, profit_b: tuple) -> bool:
        """
        Compare the profits of the two offload strategies using the dictionary order algorithm.

        Only the positions present in both tuples are compared, so a short
        threshold such as ``(0,)`` is matched against the first term only.

        Args:
            profit_a (tuple): the profit of a offload strategy.
            profit_b (tuple): the profit of another offload strategy.

        Returns:
            bool: whether profit_a is greater than profit_b
                (False when all compared positions are equal).
        """

        for val_a, val_b in zip(profit_a, profit_b):
            if val_a != val_b:
                # the first differing position decides
                return val_a > val_b
        return False

    def _update_state(self, best_ts: TrainingSimulator):
        """
        Update the solver state.

        Stores ``best_ts`` as the current best simulation and copies its
        per-node runtime memory figures onto the graph nodes.

        Args:
            best_ts (TrainingSimulator): the simulator of the newly accepted strategy.
        """

        self.best_ts = best_ts
        self._update_node_mem_info(best_ts.fwd_node_mem, best_ts.bwd_node_mem)

    def _update_node_mem_info(self, fwd_mem_info: Dict[Node, float], bwd_mem_info: Dict[Node, float]):
        """
        Update the runtime memory information of the node.

        Args:
            fwd_mem_info (Dict[Node, float]): the runtime memory of each node in forward pass.
            bwd_mem_info (Dict[Node, float]): the runtime memory of each node in backward pass.
        """

        for node, mem in fwd_mem_info.items():
            # every node is expected to carry a NodeInfo attached elsewhere
            assert hasattr(node, "node_info") and isinstance(node.node_info, NodeInfo)
            node.node_info.runtime_fwd_mem = mem
        for node, mem in bwd_mem_info.items():
            assert hasattr(node, "node_info") and isinstance(node.node_info, NodeInfo)
            node.node_info.runtime_bwd_mem = mem

    def _extract_computing_power(self):
        """
        return the FP16 computing performance of the NVIDIA GPU at NVML index 0.

        Returns:
            float: the matching ``NvDevicePower`` FP16 entry multiplied by 1e12.

        Raises:
            ImportError: pynvml could not be imported, so the GPU cannot be identified.
            TypeError: Unknown NVIDIA GPU device.
        """

        if NOT_NVML:
            # without this check the first nvml* call would fail with a bare NameError
            raise ImportError("pynvml is required to identify the NVIDIA GPU device, but it could not be imported")

        nvmlInit()
        # NOTE(review): NVML index 0 is hard-coded; it is not derived from the current device — confirm intended.
        handle = nvmlDeviceGetHandleByIndex(0)
        device_name = nvmlDeviceGetName(handle)
        if isinstance(device_name, bytes):
            # older pynvml releases return bytes, which cannot be searched with a str needle
            device_name = device_name.decode("utf-8", errors="replace")
        # NOTE(review): presumably converts TFLOPS to FLOPS — confirm against NvDevicePower.
        units = 1e12

        if "RTX 3080" in device_name:
            return NvDevicePower.RTX3080_FP16 * units
        elif "RTX 3090" in device_name:
            return NvDevicePower.RTX3090_FP16 * units
        elif "V100" in device_name:
            return NvDevicePower.V100_FP16 * units
        elif "A100" in device_name:
            return NvDevicePower.A100_FP16 * units
        else:
            raise TypeError(f"Unknown NVIDIA GPU device name {device_name}")

    def _profile_bandwidth(self):
        """
        Profile the bidirectional communication bandwidth between CPU and GPU
        using data volumes ranging from 1KB to 1GB.

        For each direction, int8 tensors of doubling size are copied between
        pinned host memory and the "cuda" device and timed with ``benchmark_func``.

        Returns:
            Dict[str, Dict[int, float]]: for each link ("h2d" and "d2h"), a mapping
                from transfer size in bytes to the measured bandwidth in bytes per second.
        """

        print("profiling bandwidth ......")
        link_to_bandwidth = {}

        for link in ("h2d", "d2h"):
            t_size = 1024
            size_to_bandwidth = {}

            # from 1KB to 1GB: 21 sizes, doubling each step
            for _ in range(21):
                # int8 elements, so t_size elements are exactly t_size bytes
                if link == "h2d":
                    src_tensor = torch.ones(t_size, dtype=torch.int8, pin_memory=True)
                    dst_tensor = torch.ones(t_size, dtype=torch.int8, device="cuda")
                else:
                    src_tensor = torch.ones(t_size, dtype=torch.int8, device="cuda")
                    dst_tensor = torch.ones(t_size, dtype=torch.int8, pin_memory=True)

                def func():
                    dst_tensor.copy_(src_tensor)

                size_to_bandwidth[t_size] = t_size / benchmark_func(func, number=5, repeat=3)
                print(
                    f"size: {t_size / 1024 ** 2:.3f} MB, "
                    f"{src_tensor.device.type}-to-{dst_tensor.device.type} "
                    f"bandwidth: {size_to_bandwidth[t_size] / 1024 ** 3:.3f} GB/s"
                )

                t_size *= 2

            link_to_bandwidth[link] = size_to_bandwidth
        return link_to_bandwidth


class SynGreedySolver(Solver):
    """
    Greedy solver that picks, one at a time, the regions whose parameters are
    offloaded and marked as synchronous (``is_syn``).

    Args:
        region_list (List[Region]): represents the linearized DNN computing graph.
        memory_budget (float): the given memory budget, forwarded to ``Solver``.
    """

    def __init__(self, region_list: List[Region], memory_budget: float = -1.0) -> None:
        super().__init__(region_list, memory_budget)

        self.best_ts: SynTrainingSimulator = None
        self._init_state()

    def _init_state(self):
        """
        Initialize the solver state when without offloading.
        """

        ts = SynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
        ts.execute()
        self._update_state(ts)

    def _call_solver(self):
        """
        Call the solver to search an efficient parameter offloading strategy for the linearized graph.
        The solver adopts greedy algorithm: while the simulated peak memory exceeds the budget,
        the not-yet-offloaded region with the highest profit is offloaded.

        Raises:
            NotImplementedError: Unable to find a solution for the given memory budget.
        """

        print("search offloading strategy ......")
        while self.best_ts.peak_mem > self.memory_budget:
            offload_region = None
            best_ts = None
            # a candidate is only accepted when its first profit term is strictly positive
            max_profit = (0,)

            # search which region should be offloaded,
            # the last region does not need to be offloaded.
            for region in self.region_list[:-1]:
                if region.param_size and not region.need_offload:
                    temp_ts, profit = self._try_to_offload(region)
                    if self._compare_profit(profit, max_profit):
                        offload_region = region
                        max_profit = profit
                        best_ts = temp_ts

            if offload_region is not None and best_ts is not None:
                # commit the best choice of this round
                offload_region.need_offload = True
                offload_region.is_syn = True
                self._update_state(best_ts)
            else:
                raise NotImplementedError(
                    f"can't find the offload strategy met the memory budget {self.memory_budget / 1024 ** 2} MB, "
                    f"it needs {self.best_ts.peak_mem / 1024 ** 2:.3f} MB at least!"
                )

    def _call_solver_l2l(self):
        """
        The layer-wise offload strategy: every region except the last one is offloaded.
        """

        for region in self.region_list[:-1]:
            region.need_offload = True
            region.is_syn = True

    def _try_to_offload(self, offload_region: Region):
        """
        Tentatively mark the region as offloaded, evaluate that choice, then undo the mark.

        Args:
            offload_region (Region): the candidate region, which must not be offloaded yet.

        Returns:
            SynTrainingSimulator: the training simulator corresponding to the tentative strategy.
            tuple: the profit of the tentative strategy.
        """

        # record previous information
        orig_need_offload = offload_region.need_offload
        assert not orig_need_offload
        offload_region.need_offload = True

        try:
            ts, profit = self._eval_one_choice(offload_region)
        finally:
            # restore previous information, even when the simulation fails,
            # so that a failed trial never leaves the region marked as offloaded
            offload_region.need_offload = orig_need_offload
        return ts, profit

    def _eval_one_choice(self, offload_region: Region):
        """
        Evaluate the profit of a strategy choice.

        Args:
            offload_region (Region): the offload region of current choice.

        Returns:
            SynTrainingSimulator: the training simulator corresponding to the current strategy.
            tuple: contains memory saving and cost information of the current strategy.
        """

        ts = SynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
        ts.execute()

        # NOTE(review): the factor 2.0 presumably counts two transfers of the parameters — confirm.
        extra_comm_cost = 2.0 * ts._get_communication_overhead("h2d", offload_region.param_size)
        # the shared region needs to be moved twice
        if offload_region.r_id < offload_region.shared_rid:
            extra_comm_cost *= 2.0
        profit = self._compute_offload_profit(ts.total_mem_saving, self.best_ts.peak_mem - ts.peak_mem, extra_comm_cost)

        return ts, profit


class AsynGreedySolver(Solver):
    """
    Greedy solver that offloads regions and, for each of them, looks for a later
    region (the "host") during which the offloaded region is prefetched in the backward pass.

    Args:
        region_list (List[Region]): represents the linearized DNN computing graph.
        memory_budget (float): the given memory budget, forwarded to ``Solver``.
        search_window_size (int): how many regions right after a candidate are tried as its prefetch host.
    """

    def __init__(self, region_list: List[Region], memory_budget: float = -1.0, search_window_size: int = 3):
        super().__init__(region_list, memory_budget)

        self.search_window_size = search_window_size
        # Records the prefetch execution location of the offloaded region
        # (offloaded region r_id -> host region)
        self.region_to_region_map = {}
        self.best_ts: AsynTrainingSimulator = None

        self._init_state()

    def _init_state(self):
        """
        Initialize the solver state when without offloading.
        """

        ts = AsynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
        ts.execute()
        self._update_state(ts)
        print("init peak memory", self.best_ts.peak_mem / 1024**2, "MB")

    def _call_solver(self):
        """
        Call the solver to search an efficient parameter offloading strategy for the linearized graph.
        The solver adopts greedy algorithm. When no further region can be offloaded with a profit
        but asynchronous prefetches exist, the strategy is repaired instead.

        Raises:
            NotImplementedError: Unable to find a solution for the given memory budget.
        """

        print("search for offloading strategy ......")
        while self.best_ts.peak_mem > self.memory_budget:
            # Best prefetch host found in this round for each candidate region (r_id -> host region).
            # Kept separate from self.region_to_region_map, which only holds committed choices.
            candidate_hosts = {}
            region_to_offload = None
            max_offload_profit = (0,)
            best_offl_ts = None

            # search which region should be offloaded,
            # the last region does not need to be offloaded
            for region in self.region_list[:-1]:
                if not region.param_size or region.need_offload:
                    continue

                max_prefetch_profit = (0,)
                best_pref_ts = None

                # search when to prefetch the region offloaded
                for host_region in self.region_list[region.r_id + 1 : region.r_id + 1 + self.search_window_size]:
                    if host_region.bwd_prefetch_region is not None:
                        # this host already prefetches another region
                        continue

                    temp_ts, profit = self._try_to_offload(host_region, region)

                    if self._compare_profit(profit, max_prefetch_profit):
                        candidate_hosts[region.r_id] = host_region
                        max_prefetch_profit = profit
                        best_pref_ts = temp_ts
                        if profit[0] == float("inf"):
                            # the transfer is fully overlapped, no better host can exist
                            break

                if self._compare_profit(max_prefetch_profit, max_offload_profit):
                    region_to_offload = region
                    max_offload_profit = max_prefetch_profit
                    best_offl_ts = best_pref_ts

            if (region_to_offload is not None) and (best_offl_ts is not None):
                # commit the best choice of this round
                chosen_host = candidate_hosts[region_to_offload.r_id]
                region_to_offload.need_offload = True
                if chosen_host == region_to_offload:
                    region_to_offload.is_syn = True
                else:
                    chosen_host.bwd_prefetch_region = region_to_offload
                    self.region_to_region_map[region_to_offload.r_id] = chosen_host

                self._update_state(best_offl_ts)

            elif self.region_to_region_map:
                self._repair_strategy()
            else:
                raise NotImplementedError(
                    f"can't find the offload strategy met the memory budget {self.memory_budget / 1024 ** 2} MB, "
                    f"it needs {self.best_ts.peak_mem / 1024 ** 2:.3f} MB at least!"
                )

    def _try_to_offload(self, host_region: Region, offload_region: Region):
        """
        Attempts to offload the region and prefetch it in backward pass.

        The regions are modified only for the duration of the evaluation.

        Args:
            host_region (Region): the region during which the prefetch is issued.
            offload_region (Region): the region to offload.

        Returns:
            AsynTrainingSimulator: the training simulator corresponding to the tentative strategy.
            tuple: the profit of the tentative strategy.
        """

        # record previous information
        orig_prefetch = host_region.bwd_prefetch_region
        orig_is_syn = offload_region.is_syn
        orig_need_offload = offload_region.need_offload

        if host_region == offload_region:
            offload_region.is_syn = True
        else:
            host_region.bwd_prefetch_region = offload_region
        offload_region.need_offload = True

        try:
            ts, profit = self._eval_one_choice()
        finally:
            # restore previous information, even when the simulation fails,
            # so that a failed trial never leaks into the committed strategy
            host_region.bwd_prefetch_region = orig_prefetch
            offload_region.is_syn = orig_is_syn
            offload_region.need_offload = orig_need_offload

        return ts, profit

    def _try_convert_to_syn_upload(self, host_region: Region, offload_region: Region):
        """
        Attempts to convert asynchronous prefetch into synchronous upload operations.

        The regions are modified only for the duration of the evaluation.

        Args:
            host_region (Region): the region currently prefetching ``offload_region``.
            offload_region (Region): the asynchronously prefetched region.

        Returns:
            AsynTrainingSimulator: the training simulator corresponding to the tentative strategy.
            tuple: the profit of the tentative strategy.
        """

        # record previous information
        orig_prefetch = host_region.bwd_prefetch_region
        orig_is_syn = offload_region.is_syn
        assert orig_prefetch is not None and not orig_is_syn

        host_region.bwd_prefetch_region = None
        offload_region.is_syn = True

        try:
            ts, profit = self._eval_one_choice()
        finally:
            # restore previous information, even when the simulation fails
            host_region.bwd_prefetch_region = orig_prefetch
            offload_region.is_syn = orig_is_syn

        return ts, profit

    def _repair_strategy(self):
        """
        Repair offload strategy.
        It attempts to convert asynchronous prefetch into synchronous upload operations and selects the best one.
        The repair process does not end until peak memory is reduced or there is no asynchronous prefetch operation.

        Returns:
            AsynTrainingSimulator: the simulator of the last accepted conversion,
                or None when there was no asynchronous prefetch to convert.

        Raises:
            NotImplementedError: none of the possible conversions has a positive profit.
        """
        print("repair strategy ......")

        # defined up front so that the return below is valid even if the loop never runs
        best_ts = None
        peak_mem_saving = 0
        while self.region_to_region_map and peak_mem_saving <= 0:
            max_profit = (0,)
            best_ts = None
            undo_host_region = None
            undo_offload_region = None

            for offload_region_id, host_region in self.region_to_region_map.items():
                offload_region = self.region_list[offload_region_id]
                assert host_region.bwd_prefetch_region == offload_region
                assert offload_region.need_offload
                assert not offload_region.is_syn

                ts, profit = self._try_convert_to_syn_upload(host_region, offload_region)

                if self._compare_profit(profit, max_profit):
                    undo_host_region = host_region
                    undo_offload_region = offload_region
                    max_profit = profit
                    best_ts = ts

            if best_ts is None:
                raise NotImplementedError("repair error!")

            # commit the best conversion of this round
            assert not undo_offload_region.is_syn
            undo_offload_region.is_syn = True
            undo_host_region.bwd_prefetch_region = None

            peak_mem_saving = self.best_ts.peak_mem - best_ts.peak_mem

            self._update_state(best_ts)
            self.region_to_region_map.pop(undo_offload_region.r_id)

        return best_ts

    def _eval_one_choice(self):
        """
        Evaluate the profit of a strategy choice.

        The extra cost is the increase of the simulated iteration end time
        over the current best strategy, never less than zero.

        Returns:
            AsynTrainingSimulator: the training simulator corresponding to the current strategy.
            tuple: contains memory saving and cost information of the current strategy.
        """

        ts = AsynTrainingSimulator(self.region_list, self.comp_power, self.link_to_bandwidth)
        ts.execute()

        extra_comm_cost = max(ts.iter_end_time - self.best_ts.iter_end_time, 0)
        profit = self._compute_offload_profit(ts.total_mem_saving, self.best_ts.peak_mem - ts.peak_mem, extra_comm_cost)

        return ts, profit


class SolverFactory:
    """
    Registry of the available parameter offload solvers, keyed by policy name.
    """

    solvers: Dict[str, Type[Solver]] = {"syn": SynGreedySolver, "asyn": AsynGreedySolver}

    @staticmethod
    def create(solver_name: str) -> Type[Solver]:
        """
        Look up the solver class registered under the given policy name.

        Args:
            solver_name (str): the name of the offload policy.

        Returns:
            Type[Solver]: the solver class (not an instance).

        Raises:
            TypeError: the policy name is not registered.
        """

        solver_cls = SolverFactory.solvers.get(solver_name)
        if solver_cls is None:
            raise TypeError(f"Unknown parameter offload policy {solver_name}")
        return solver_cls

    @staticmethod
    def get_solver_names():
        """
        Returns:
            tuple: the names of all registered offload policies.
        """

        return tuple(SolverFactory.solvers)