ColossalAI/colossalai/utils/profiler/pcie_profiler.py

from pathlib import Path
from torch.autograd.profiler import profile
from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
from typing import List


def _get_size(dtype: str):
    if dtype == "fp16":
        return 2
    elif dtype == "fp32":
        return 4
    else:
        raise NotImplementedError


def _get_numel(my_list: List[int]) -> int:
    from functools import reduce
    from operator import mul
    return reduce(mul, my_list)


def _reduce_location(locations: List[str]) -> str:
    ret = []
    for lo in locations:
        ret.append(lo)
        ret.append("\n")
    ret = ret[:-1]
    return ''.join(ret)


class PcieEvent(object):
    """Running totals for a group of PCIe copy events.

    Holds three counters: ``count`` (number of events), ``pcie_vol``
    (accumulated transfer volume) and ``cuda_time`` (accumulated CUDA time).
    """

    def __init__(self, count: int = 0, pcie_vol: int = 0, cuda_time: int = 0):
        self.count, self.pcie_vol, self.cuda_time = count, pcie_vol, cuda_time

    def add(self, rhs):
        """Add every counter of ``rhs`` onto this event, in place."""
        for field in ("count", "pcie_vol", "cuda_time"):
            setattr(self, field, getattr(self, field) + getattr(rhs, field))


class PcieProfiler(BaseProfiler):
    """Pcie profiler. Records all data transmission between CPU and GPU.

    A ``torch.autograd.profiler.profile`` session is opened by :meth:`enable`
    and closed by :meth:`disable`. When it is closed, the collected events are
    folded into:

    * ``h2d_count`` / ``h2d_time``: events whose name contains ``'Memcpy HtoD'``;
    * ``d2h_count`` / ``d2h_time``: events whose name contains ``'Memcpy DtoH'``;
    * ``ops_record``: ``aten::copy_`` events grouped by their code location.

    TODO: Merge pcie profiler into communication profiler

    Args:
        dtype: precision name (``"fp16"`` or ``"fp32"``). Its element size is
            multiplied by the element count of the first input shape of every
            ``aten::copy_`` event to estimate the transferred volume.
        depth: number of leading stack entries of an event that are joined to
            form the code location used as the grouping key.
    """

    def __init__(self, dtype: str = "fp32", depth: int = 1):
        super().__init__(profiler_name="Pcie", priority=10)
        self.depth = depth
        self.data_size = _get_size(dtype)
        self.h2d_count = 0
        self.h2d_time = 0
        self.d2h_count = 0
        self.d2h_time = 0

        self.ops_record = dict()
        self.profiler = None

    def reset(self):
        """Clear all accumulated statistics and drop any profiler reference."""
        self.h2d_count = 0
        self.h2d_time = 0
        self.d2h_count = 0
        self.d2h_time = 0

        self.ops_record = dict()
        self.profiler = None

    def enable(self):
        """Open a new profiling session with shape and stack recording on."""
        self.profiler = profile(enabled=True,
                                use_cuda=True,
                                use_cpu=True,
                                use_kineto=True,
                                record_shapes=True,
                                with_stack=True)
        self.profiler.__enter__()

    def disable(self):
        """Close the profiling session and accumulate its events.

        Does nothing when no session is active (``enable`` was never called,
        or ``disable`` already ran), instead of failing on a ``None`` profiler.
        """
        profiler = self.profiler
        if profiler is None:
            return
        # Drop the reference up front so it is released even if closing the
        # session or reading its events raises.
        self.profiler = None
        profiler.__exit__(None, None, None)

        if not profiler.enabled:
            return

        for event in profiler.function_events:
            if event.name == "aten::copy_":
                shapes = event.input_shapes
                if not shapes:
                    # No shapes recorded for this event: volume is unknown.
                    continue
                t_shape = shapes[0]
                # Skip events without a usable shape, without any CUDA time,
                # or without a stack to attribute them to.
                if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0:
                    continue
                current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total)
                code_location = _reduce_location(event.stack[:self.depth])
                if code_location in self.ops_record:
                    self.ops_record[code_location].add(current_comm_event)
                else:
                    self.ops_record[code_location] = current_comm_event
            elif 'Memcpy HtoD' in event.name:
                self.h2d_count += 1
                self.h2d_time += event.cuda_time_total
            elif 'Memcpy DtoH' in event.name:
                self.d2h_count += 1
                self.d2h_time += event.cuda_time_total

    def to_tensorboard(self, writer):
        """Write the report as text under the tag ``"Data Transmission"``.

        Blank-line separators (``"\\n\\n"``) are used between report lines.
        """
        writer.add_text(tag="Data Transmission", text_string=self.result_str("\n\n"))

    def to_file(self, filename: Path):
        """Write the report to ``filename``, overwriting any existing file."""
        with open(filename, "w") as f:
            f.write(self.result_str())

    def show(self):
        """Print the report to standard output."""
        print(self.result_str())

    def result_str(self, sep: str = "\n"):
        """Build the textual report.

        Args:
            sep: string appended after every report line.

        Returns:
            The summary of host-to-device / device-to-host transfers followed
            by one entry per recorded code location, ordered by descending
            accumulated CUDA time.
        """
        res = []

        def append(s: str = None):
            # Called without an argument, this emits only a separator.
            if s is not None:
                res.append(s)
            res.append(sep)

        append("Pcie profiling result:")
        append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time)))
        append("number of transmission (CPU -> GPU): {}".format(self.h2d_count))
        append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time)))
        append("number of transmission (GPU -> CPU): {}".format(self.d2h_count))

        append("Possible data transmission events in PCIE:")

        # The five columns below add up to 62 characters, matching the rule.
        separator = '-' * 62
        row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2

        append(separator)
        append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))
        append(separator)

        show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)
        for location, event in show_list:
            # The location may span several lines, so it gets its own line and
            # the 'Location' column of the data row is left empty.
            append(location)
            append(
                row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol),
                                  _format_bandwidth(event.pcie_vol, event.cuda_time), event.count))
            append()

        return ''.join(res)
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`from pathlib import Path`
			`from torch.autograd.profiler import profile`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`from typing import List`


			`def _get_size(dtype: str):`
			`if dtype == "fp16":`
			`return 2`
			`elif dtype == "fp32":`
			`return 4`
			`else:`
			`raise NotImplementedError`


			`def _get_numel(my_list: List[int]) -> int:`
			`from functools import reduce`
			`from operator import mul`
			`return reduce(mul, my_list)`


			`def _reduce_location(locations: List[str]) -> str:`
			`ret = []`
			`for lo in locations:`
			`ret.append(lo)`
			`ret.append("\n")`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`ret = ret[:-1]`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`return ''.join(ret)`


			`class PcieEvent(object):`
			`"""Pcie Event.`
			`"""`

			`def __init__(self, count: int = 0, pcie_vol: int = 0, cuda_time: int = 0):`
			`self.count = count`
			`self.pcie_vol = pcie_vol`
			`self.cuda_time = cuda_time`

			`def add(self, rhs):`
			`self.count += rhs.count`
			`self.pcie_vol += rhs.pcie_vol`
			`self.cuda_time += rhs.cuda_time`


			`class PcieProfiler(BaseProfiler):`
			`"""Pcie profiler. Records all data transmission between CPU and GPU.`

			`TODO: Merge pcie profiler into communication profiler`
			`"""`

polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`def __init__(self, dtype: str = "fp32", depth: int = 1):`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`super().__init__(profiler_name="Pcie", priority=10)`
			`self.depth = depth`
			`self.data_size = _get_size(dtype)`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`self.h2d_count = 0`
			`self.h2d_time = 0`
			`self.d2h_count = 0`
			`self.d2h_time = 0`

			`self.ops_record = dict()`
			`self.profiler = None`

			`def reset(self):`
			`self.h2d_count = 0`
			`self.h2d_time = 0`
			`self.d2h_count = 0`
			`self.d2h_time = 0`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`self.ops_record = dict()`
			`self.profiler = None`

			`def enable(self):`
			`self.profiler = profile(enabled=True,`
			`use_cuda=True,`
			`use_cpu=True,`
			`use_kineto=True,`
			`record_shapes=True,`
			`with_stack=True)`
			`self.profiler.__enter__()`

			`def disable(self):`
			`self.profiler.__exit__(None, None, None)`

			`if self.profiler.enabled:`
			`events = self.profiler.function_events`
			`for event in events:`
[hotfix] fixed bugs in ShardStrategy and PcieProfiler (#394) 2022-03-11 10:12:46 +00:00			`if event.name == "aten::copy_":`
[profiler] Fixed bugs in CommProfiler and PcieProfiler (#377) 2022-03-10 09:54:55 +00:00			`t_shape = event.input_shapes[0]`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0:`
[profiler] Fixed bugs in CommProfiler and PcieProfiler (#377) 2022-03-10 09:54:55 +00:00			`continue`
			`current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total)`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`code_location = _reduce_location(event.stack[:self.depth])`
			`if code_location in self.ops_record:`
			`self.ops_record[code_location].add(current_comm_event)`
			`else:`
			`self.ops_record[code_location] = current_comm_event`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`elif 'Memcpy HtoD' in event.name:`
			`self.h2d_count += 1`
			`self.h2d_time += event.cuda_time_total`
			`elif 'Memcpy DtoH' in event.name:`
			`self.d2h_count += 1`
			`self.d2h_time += event.cuda_time_total`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`self.profiler = None`

			`def to_tensorboard(self, writer):`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`writer.add_text(tag="Data Transmission", text_string=self.result_str("\n\n"))`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`def to_file(self, filename: Path):`
			`with open(filename, "w") as f:`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`f.write(self.result_str())`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`def show(self):`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`print(self.result_str())`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`def result_str(self, sep: str = "\n"):`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`res = []`

polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`def append(s: str = None):`
			`if s is not None:`
			`res.append(s)`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00			`res.append(sep)`

			`append("Pcie profiling result:")`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time)))`
			`append("number of transmission (CPU -> GPU): {}".format(self.h2d_count))`
			`append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time)))`
			`append("number of transmission (GPU -> CPU): {}".format(self.d2h_count))`

			`append("Possible data transmission events in PCIE:")`

			`seperation = '-' * 62`
			`row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2`

			`append(seperation)`
			`append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls'))`
			`append(seperation)`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time)`
			`for location, event in show_list:`
			`append(location)`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`append(`
			`row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol),`
			`_format_bandwidth(event.pcie_vol, event.cuda_time), event.count))`
			`append()`
Added PCIE profiler to dectect data transmission (#373) 2022-03-10 08:24:57 +00:00
			`return ''.join(res)`