# ColossalAI/colossalai/device/alpha_beta_profiler.py

import math
import time
from typing import Dict, List, Tuple

import torch
import torch.distributed as dist

from colossalai.logging import get_dist_logger

# Number of bytes in one gibibyte (2**30); used for buffer sizing and for GB/s reporting.
GB = 1 << 30
# Size in bytes of the smallest message sent while profiling latency.
BYTE = 4
# Constant overhead, in seconds, subtracted from every averaged measurement.
FRAMEWORK_LATENCY = 0


class AlphaBetaProfiler:
    '''
    Profile alpha and beta value for a given device list.

    Usage:
        # Note: the environment of execution is supposed to be
        # multi-process with multi-gpu in mpi style.
        >>> physical_devices = [0, 1, 4, 5]
        >>> ab_profiler = AlphaBetaProfiler(physical_devices)
        >>> ab_dict = ab_profiler.alpha_beta_dict
        >>> print(ab_dict)
        {(0, 1): (1.9641406834125518e-05, 4.74049549614719e-12), (0, 4): (1.9506998360157013e-05, 6.97421973297474e-11), (0, 5): (2.293858677148819e-05, 7.129930361393644e-11),
         (1, 4): (1.9010603427886962e-05, 7.077968863788975e-11), (1, 5): (1.9807778298854827e-05, 6.928845708992215e-11), (4, 5): (1.8681809306144713e-05, 4.7522367291330524e-12),
         (1, 0): (1.9641406834125518e-05, 4.74049549614719e-12), (4, 0): (1.9506998360157013e-05, 6.97421973297474e-11), (5, 0): (2.293858677148819e-05, 7.129930361393644e-11),
         (4, 1): (1.9010603427886962e-05, 7.077968863788975e-11), (5, 1): (1.9807778298854827e-05, 6.928845708992215e-11), (5, 4): (1.8681809306144713e-05, 4.7522367291330524e-12)}
    '''

    def __init__(self,
                 physical_devices: List[int],
                 alpha_beta_dict: Dict[Tuple[int, int], Tuple[float, float]] = None,
                 ctype: str = 'a',
                 warmup: int = 5,
                 repeat: int = 25,
                 latency_iters: int = 5,
                 homogeneous_tolerance: float = 0.1):
        '''
        Store the profiling configuration, create the pairwise process groups and
        obtain the alpha-beta table.

        Args:
            physical_devices: A list of device ids, each element is the global rank of that device.
            alpha_beta_dict: A pre-computed dict which maps a device pair to its (alpha, beta) values.
                When it is None, the table is measured with profile_ab().
            ctype: Collective used for the measurement, 'a' for all-reduce, 'b' for broadcast.
            warmup: Number of untimed warmup iterations run before each measurement.
            repeat: Number of timed iterations averaged in each measurement.
            latency_iters: Number of message sizes probed when measuring latency.
            homogeneous_tolerance: Relative tolerance on beta used by search_best_logical_mesh()
                to decide whether two device pairs belong to the same homogeneous group.
        '''
        # Measurement settings.
        self.ctype = ctype
        self.warmup, self.repeat, self.latency_iters = warmup, repeat, latency_iters
        self.homogeneous_tolerance = homogeneous_tolerance

        # Device topology; the pairwise groups are created even when a table is supplied.
        self.physical_devices = physical_devices
        self.world_size = len(physical_devices)
        self.process_group_dict = None
        self._init_profiling()

        # Only profile when the caller did not hand in a ready-made table.
        self.alpha_beta_dict = self.profile_ab() if alpha_beta_dict is None else alpha_beta_dict

    def _init_profiling(self):
        '''
        Create one process group for every pair of physical devices.

        The pairs are enumerated in the order the devices appear in self.physical_devices
        (earlier device first), and the resulting mapping from pair to group handler is
        stored in self.process_group_dict.
        '''
        devices = self.physical_devices
        device_pairs = [(devices[front], devices[back])
                        for front in range(self.world_size - 1)
                        for back in range(front + 1, self.world_size)]

        # One handler per pair, created in pair-enumeration order.
        self.process_group_dict = {pair: dist.new_group(pair) for pair in device_pairs}

    def _profile(self, process_group, pg_handler, nbytes):
        '''
        Time the configured collective on a buffer of nbytes bytes.

        Args:
            process_group: A tuple of global ranks of the process group. Its first entry is used as
                the broadcast source and is the only rank that reports a measurement.
            pg_handler: The handler of the process group.
            nbytes: Message size in bytes. The buffer holds nbytes // 4 elements, i.e. 4-byte
                elements are assumed (torch's default float32 -- confirm if the default dtype is changed).

        Returns:
            (avg_time_s, alg_band) on the rank equal to process_group[0], where avg_time_s is the
            average wall time of one collective in seconds (minus FRAMEWORK_LATENCY) and alg_band is
            nbytes / avg_time_s. (None, None) on every other rank.

        Raises:
            ValueError: If self.ctype is neither 'a' nor 'b'.
        '''
        # Fail early: an unknown ctype would otherwise time an empty loop and report garbage.
        if self.ctype not in ("a", "b"):
            raise ValueError(f"Unsupported ctype {self.ctype!r}: expected 'a' (all-reduce) or 'b' (broadcast).")

        logger = get_dist_logger()
        rank = dist.get_rank()
        src_device_num = process_group[0]
        world_size = len(process_group)

        device = torch.cuda.current_device()
        # Allocate directly on the device instead of filling a host buffer and copying it over.
        buf = torch.randn(nbytes // 4, device=device)

        def _run_collective():
            if self.ctype == "a":
                dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=pg_handler)
            else:
                dist.broadcast(buf, src=src_device_num, group=pg_handler)

        torch.cuda.synchronize()
        # warmup
        for _ in range(self.warmup):
            _run_collective()
        torch.cuda.synchronize()

        # Line the ranks up before starting the clock, and synchronize the device before stopping it.
        dist.barrier(group=pg_handler)
        begin = time.perf_counter()
        for _ in range(self.repeat):
            _run_collective()
        torch.cuda.synchronize()
        end = time.perf_counter()
        dist.barrier(group=pg_handler)

        if rank != src_device_num:
            # Just a placeholder
            return (None, None)

        avg_time_s = (end - begin) / self.repeat - FRAMEWORK_LATENCY
        alg_band = nbytes / avg_time_s
        if self.ctype == "a":
            # convert the bandwidth of all-reduce algorithm to the bandwidth of the hardware.
            bus_band = 2 * (world_size - 1) / world_size * alg_band
        else:
            bus_band = alg_band

        logger.info(
            f"GPU:{rank}, Bytes: {nbytes} B,Time: {round(avg_time_s * 1e6,2)} us, Bus bandwidth: {round(bus_band / GB,2)} GB/s"
        )
        return (avg_time_s, alg_band)

    def profile_latency(self, process_group, pg_handler):
        '''
        This function is used to profile the latency of the given process group with a series of bytes.

        The collective is timed once per message size, starting at BYTE bytes and doubling for
        each of the self.latency_iters iterations.

        Args:
            process_group: A tuple of global rank of the process group.
            pg_handler: The handler of the process group.

        Returns:
            latency: None if the latency is not measured on this rank (or latency_iters is 0),
                otherwise the median of the measured times (the upper median for an even count).
        '''
        latency_list = [self._profile(process_group, pg_handler, BYTE << i)[0] for i in range(self.latency_iters)]

        # _profile only reports a time on the first rank of the group; other ranks get None.
        if not latency_list or latency_list[0] is None:
            return None

        # Sort first: the samples are collected in message-size order, so the middle
        # element of the raw list is not the median.
        return sorted(latency_list)[len(latency_list) // 2]

    def profile_bandwidth(self, process_group, pg_handler, maxbytes=(1 * GB)):
        '''
        Measure the bandwidth of the given process group with a single large message.

        Args:
            process_group: A tuple of global rank of the process group.
            pg_handler: The handler of the process group.
            maxbytes: Size in bytes of the message used for the measurement.

        Returns:
            The second element of the _profile() result: the measured algorithm bandwidth
            on the reporting rank, None on the other ranks.
        '''
        measurement = self._profile(process_group, pg_handler, maxbytes)
        return measurement[1]

    def profile_ab(self):
        '''
        This method is used to profile the alpha and beta value for a given device list.

        Every device pair in self.process_group_dict is measured by its two members; the result
        is then broadcast from the first rank of the pair so that every caller ends up with the
        same table. Each pair is finally mirrored, so both (a, b) and (b, a) are present.

        Returns:
            alpha_beta_dict: A dict which maps process group to its alpha and beta value.

        Raises:
            RuntimeError: If a device of a pair reports no free memory for the bandwidth buffer.
        '''
        alpha_beta_dict: Dict[Tuple[int], Tuple[float]] = {}
        rank = dist.get_rank()
        # NOTE(review): this handler is not used below. The call is kept because creating a
        # group is itself a collective operation; confirm before removing it.
        global_pg_handler = dist.new_group(self.physical_devices)

        def get_max_nbytes(process_group: Tuple[int], pg_handler: dist.ProcessGroup):
            '''
            Return the bandwidth-test message size for the pair: the largest power of two
            that fits in the free memory of both devices, capped at 1 GB.
            '''
            assert rank in process_group
            device = torch.cuda.current_device()
            rank_max_nbytes = torch.cuda.mem_get_info(device)[0]
            rank_max_nbytes = torch.tensor(rank_max_nbytes, device=device)
            # Take the minimum over the group so that every member uses the same size.
            dist.all_reduce(rank_max_nbytes, op=dist.ReduceOp.MIN, group=pg_handler)
            free_nbytes = int(rank_max_nbytes.item())
            if free_nbytes <= 0:
                raise RuntimeError(f"No free device memory available to profile process group {process_group}.")
            # Integer arithmetic keeps this correct below 1 GB, where truncating log2(free / GB)
            # would round up to a full GB or produce a negative shift count.
            return min(GB, 1 << (free_nbytes.bit_length() - 1))

        for process_group, pg_handler in self.process_group_dict.items():
            if rank in process_group:
                max_nbytes = get_max_nbytes(process_group, pg_handler)
                alpha = self.profile_latency(process_group, pg_handler)
                bandwidth = self.profile_bandwidth(process_group, pg_handler, maxbytes=max_nbytes)
            else:
                # Ranks outside the pair only take part in the broadcast below.
                alpha = None
                bandwidth = None

            beta = None if bandwidth is None else 1 / bandwidth

            # Only the first rank of the pair holds real values; share them with everyone.
            broadcast_list = [alpha, beta]
            dist.broadcast_object_list(broadcast_list, src=process_group[0])
            alpha_beta_dict[process_group] = tuple(broadcast_list)

        # add symmetry pair to the alpha_beta_dict
        symmetry_ab_dict = {(pg[1], pg[0]): alpha_beta_pair for pg, alpha_beta_pair in alpha_beta_dict.items()}
        alpha_beta_dict.update(symmetry_ab_dict)

        return alpha_beta_dict

    def search_best_logical_mesh(self):
        '''
        This method is used to search the best logical mesh for the given device list.

        The best logical mesh is searched in following steps:
            1. detect homogeneous device groups, we assume that the devices in the alpha_beta_dict
                are homogeneous if the beta value is close enough.
            2. Find the best homogeneous device group contains all the physical devices. The best homogeneous
                device group means the lowest beta value in the groups which contains all the physical devices.
                And the reason we require the group contains all the physical devices is that the devices not in
                the group will decrease the bandwidth of the group.
            3. If the best homogeneous device group is found, we will construct the largest ring for each device
                based on the best homogeneous device group, and the best logical mesh will be the union of all the
                rings. Otherwise, the best logical mesh will be the balanced logical mesh, such as shape (2, 2) for
                4 devices.

        Returns:
            best_logical_mesh: The best logical mesh for the given device list.

        Raises:
            AssertionError: If the number of devices is not a positive power of two.

        Usage:
            >>> physical_devices = [0, 1, 2, 3]
            >>> ab_profiler = AlphaBetaProfiler(physical_devices)
            >>> best_logical_mesh = ab_profiler.search_best_logical_mesh()
            >>> print(best_logical_mesh)
            [[0, 1], [2, 3]]
        '''

        def _power_of_two(integer):
            # Zero passes the bit trick as well, so it has to be excluded explicitly.
            return integer > 0 and integer & (integer - 1) == 0

        def _detect_homogeneous_device(alpha_beta_dict):
            '''
            This function is used to detect whether the devices in the alpha_beta_dict are homogeneous.

            Note: we assume that the devices in the alpha_beta_dict are homogeneous if the beta value
                of the devices are in range of [(1 - self.homogeneous_tolerance), (1 + self.homogeneous_tolerance)]
                * base_beta.

            Returns a dict which maps the base beta of every group (the beta of the first pair
            placed in it) to the list of device pairs in that group.
            '''
            lower_factor = 1 - self.homogeneous_tolerance
            upper_factor = 1 + self.homogeneous_tolerance
            homogeneous_device_dict: Dict[float, List[Tuple[int]]] = {}
            for process_group, (_, beta) in alpha_beta_dict.items():
                # Join the first existing group whose base beta is close enough.
                match_beta = None
                for beta_value in homogeneous_device_dict:
                    if beta_value * lower_factor <= beta <= beta_value * upper_factor:
                        match_beta = beta_value
                        break

                if match_beta is None:
                    # No group fits: this pair starts a new one with its own beta as the base.
                    homogeneous_device_dict[beta] = [process_group]
                else:
                    homogeneous_device_dict[match_beta].append(process_group)

            return homogeneous_device_dict

        def _check_contain_all_devices(homogeneous_group: List[Tuple[int]]):
            '''
            This function is used to check whether the homogeneous_group contains all physical devices.
            '''
            covered_devices = {device for process_group in homogeneous_group for device in process_group}
            return len(covered_devices) == len(self.physical_devices)

        def _construct_largest_ring(homogeneous_group: List[Tuple[int]]):
            '''
            This function is used to construct the largest ring in the homogeneous_group for each rank.

            Starting from every rank that is not yet placed, all ranks reachable through the pairs
            of the homogeneous_group are collected into one ring.
            '''
            ring = []
            ranks_in_ring = set()
            for rank in self.physical_devices:
                if rank in ranks_in_ring:
                    continue

                ring_for_rank = [rank]
                frontier = [rank]
                # Expand level by level until no new rank is discovered.
                while frontier:
                    next_frontier = []
                    # The most recently discovered rank of a level is expanded first.
                    for check_rank in reversed(frontier):
                        for process_group in homogeneous_group:
                            if check_rank not in process_group:
                                continue
                            rank_to_append = process_group[0] if process_group[1] == check_rank else process_group[1]
                            if rank_to_append not in ring_for_rank:
                                ring_for_rank.append(rank_to_append)
                                next_frontier.append(rank_to_append)
                    frontier = next_frontier

                ring.append(ring_for_rank)
                ranks_in_ring.update(ring_for_rank)

            return ring

        assert _power_of_two(self.world_size), \
            f"the number of devices must be a positive power of two, got {self.world_size}"

        # Fallback layout: split the devices as evenly as possible into rows and columns.
        power_of_two = int(math.log2(self.world_size))
        median = power_of_two // 2
        row_size, column_size = 2**median, 2**(power_of_two - median)
        balanced_logical_mesh = [[self.physical_devices[row_index * column_size + column_index]
                                  for column_index in range(column_size)]
                                 for row_index in range(row_size)]

        homogeneous_device_dict = _detect_homogeneous_device(self.alpha_beta_dict)

        best_logical_mesh = None
        # Try the groups from the lowest beta upwards. The group with the highest beta is never
        # used, so a single homogeneous type always falls back to the balanced mesh.
        for candidate_beta in sorted(homogeneous_device_dict)[:-1]:
            best_homogeneous_group = homogeneous_device_dict[candidate_beta]
            # if the best homogeneous group contains all physical devices,
            # we will build the logical device mesh based on it. Otherwise,
            # we will check next level homogeneous group.
            if _check_contain_all_devices(best_homogeneous_group):
                # We choose the largest ring for each rank to maximize the bus utilization.
                best_logical_mesh = _construct_largest_ring(best_homogeneous_group)
                break

        if best_logical_mesh is None:
            # in this case, we use balanced logical mesh as the best
            # logical mesh.
            best_logical_mesh = balanced_logical_mesh

        return best_logical_mesh

    def extract_alpha_beta_for_device_mesh(self):
        '''
        Extract the mesh_alpha list and mesh_beta list based on the
            best logical mesh, which will be used to initialize the device mesh.

        The first axis is made of the first device of every row of the best logical mesh, the
        second axis is its first row. Each axis is profiled by its own members and the result is
        broadcast from the first rank of the axis.

        Returns:
            mesh_alpha: [latency of the first axis, latency of the second axis].
            mesh_beta: [1e10 / bandwidth of the first axis, 1e10 / bandwidth of the second axis].

        Usage:
            >>> physical_devices = [0, 1, 2, 3]
            >>> ab_profiler = AlphaBetaProfiler(physical_devices)
            >>> mesh_alpha, mesh_beta = ab_profiler.extract_alpha_beta_for_device_mesh()
            >>> print(mesh_alpha)
            [2.5917552411556242e-05, 0.00010312341153621673]
            >>> print(mesh_beta)
            [5.875573704655635e-11, 4.7361584445959614e-12]
        '''
        best_logical_mesh = self.search_best_logical_mesh()

        first_axis = [row[0] for row in best_logical_mesh]
        second_axis = best_logical_mesh[0]

        # init process group for both axes
        first_axis_process_group = dist.new_group(first_axis)
        second_axis_process_group = dist.new_group(second_axis)

        rank = dist.get_rank()

        # extract alpha and beta for both axes
        def _extract_alpha_beta(pg, pg_handler):
            if rank in pg:
                latency = self.profile_latency(pg, pg_handler)
                bandwidth = self.profile_bandwidth(pg, pg_handler)
            else:
                # Ranks outside the axis have nothing to measure (same handling as in
                # profile_ab); they only receive the result below.
                latency = None
                bandwidth = None
            broadcast_object = [latency, bandwidth]
            dist.broadcast_object_list(broadcast_object, src=pg[0])
            return broadcast_object

        first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
        second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
        mesh_alpha = [first_latency, second_latency]
        # The beta values have been enlarged by 1e10 times temporarily because the computation cost
        # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future.
        mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]

        return mesh_alpha, mesh_beta