diff --git a/colossalai/utils/profiler/comm_profiler.py b/colossalai/utils/profiler/comm_profiler.py index d672a8e20..93c72cc65 100644 --- a/colossalai/utils/profiler/comm_profiler.py +++ b/colossalai/utils/profiler/comm_profiler.py @@ -6,20 +6,25 @@ from torch.autograd.profiler import profile import torch.distributed as dist from torch.distributed import ReduceOp from colossalai.utils import get_current_device -from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith +from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth from typing import List, Optional def _get_code_location(depth: int): - ret = "" - length = len(inspect.stack()) - for i in range(3, min(length, depth + 1)): + ret = [] + length = min(len(inspect.stack()), depth + 1) + for i in range(3, length): upper_frame = inspect.stack()[i] function_name = inspect.stack()[i - 1].function - info = upper_frame.filename + "(" + str(upper_frame.lineno) + "): " + function_name + "\n" - ret += info + ret.append(upper_frame.filename) + ret.append('(') + ret.append(str(upper_frame.lineno)) + ret.append('): ') + ret.append(function_name) + if i != length - 1: + ret.append('\n') - return ret + return ''.join(ret) torch_all_reduce = dist.all_reduce @@ -100,8 +105,9 @@ class CommProfiler(BaseProfiler): def result_list(self, sep: str = "\n"): res = [] - def append(s: str): - res.append(s) + def append(s: str = None): + if s is not None: + res.append(s) res.append(sep) if self.warn_flag: @@ -110,19 +116,26 @@ class CommProfiler(BaseProfiler): append("Collective communication profiling result:") append("total cuda time: {}".format(_format_time(self.total_cuda_time))) - append("average bandwith: {}".format(_format_bandwith(self.total_comm_vol, self.total_cuda_time))) + append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time))) append("total number of calls: {}".format(self.total_count)) - append("All events:\n----------------------------------------") + append("All events:") + + seperation = '-' * 74 + row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2 + + append(seperation) + append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls')) + append(seperation) show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time) for location, event in show_list: append(location) - append("self cuda time: {}".format(_format_time(event.self_cuda_time))) - append("{:.1f}% of total communication time".format(event.self_cuda_time / self.total_cuda_time * 100.0)) - append("self communication volme: {}".format(_format_memory(event.self_comm_vol))) - append("average bandwith: {}".format(_format_bandwith(event.self_comm_vol, event.self_cuda_time))) - append("number of calls: {}".format(event.self_count)) - append("----------------------------------------") + append( + row_format.format('', _format_time(event.self_cuda_time), + '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0), + _format_memory(event.self_comm_vol), + _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count)) + append() return ''.join(res) diff --git a/colossalai/utils/profiler/pcie_profiler.py b/colossalai/utils/profiler/pcie_profiler.py index a01a37489..3a9ec95b4 100644 --- a/colossalai/utils/profiler/pcie_profiler.py +++ b/colossalai/utils/profiler/pcie_profiler.py @@ -1,6 +1,6 @@ from pathlib import Path from torch.autograd.profiler import profile -from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwith +from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth from typing import List @@ -24,6 +24,7 @@ def _reduce_location(locations: List[str]) -> str: for lo in locations: ret.append(lo) ret.append("\n") + ret = ret[:-1] return ''.join(ret) @@ -48,18 +49,23 @@ class PcieProfiler(BaseProfiler): TODO: Merge pcie profiler into communication profiler """ - def __init__(self, - dtype: str = "fp32", - depth: int = 1, - total_count: int = 0, - total_pcie_vol: int = 0, - total_cuda_time: int = 0): + def __init__(self, dtype: str = "fp32", depth: int = 1): super().__init__(profiler_name="Pcie", priority=10) self.depth = depth self.data_size = _get_size(dtype) - self.total_count = total_count - self.total_pcie_vol = total_pcie_vol - self.total_cuda_time = total_cuda_time + self.h2d_count = 0 + self.h2d_time = 0 + self.d2h_count = 0 + self.d2h_time = 0 + + self.ops_record = dict() + self.profiler = None + + def reset(self): + self.h2d_count = 0 + self.h2d_time = 0 + self.d2h_count = 0 + self.d2h_time = 0 self.ops_record = dict() self.profiler = None @@ -81,17 +87,20 @@ class PcieProfiler(BaseProfiler): for event in events: if event.name == "aten::copy_": t_shape = event.input_shapes[0] - if len(t_shape) == 0 or event.cuda_time_total == 0: + if len(t_shape) == 0 or event.cuda_time_total == 0 or len(event.stack) == 0: continue current_comm_event = PcieEvent(1, self.data_size * _get_numel(t_shape), event.cuda_time_total) - self.total_count += current_comm_event.count - self.total_pcie_vol += current_comm_event.pcie_vol - self.total_cuda_time += current_comm_event.cuda_time code_location = _reduce_location(event.stack[:self.depth]) if code_location in self.ops_record: self.ops_record[code_location].add(current_comm_event) else: self.ops_record[code_location] = current_comm_event + elif 'Memcpy HtoD' in event.name: + self.h2d_count += 1 + self.h2d_time += event.cuda_time_total + elif 'Memcpy DtoH' in event.name: + self.d2h_count += 1 + self.d2h_time += event.cuda_time_total self.profiler = None @@ -108,24 +117,32 @@ class PcieProfiler(BaseProfiler): def result_list(self, sep: str = "\n"): res = [] - def append(s: str): - res.append(s) + def append(s: str = None): + if s is not None: + res.append(s) res.append(sep) append("Pcie profiling result:") - append("total cuda time: {}".format(_format_time(self.total_cuda_time))) - append("average bandwith: {}".format(_format_bandwith(self.total_pcie_vol, self.total_cuda_time))) - append("total number of calls: {}".format(self.total_count)) - append("All events:\n----------------------------------------") + append("time of data transmission (CPU -> GPU): {}".format(_format_time(self.h2d_time))) + append("number of transmission (CPU -> GPU): {}".format(self.h2d_count)) + append("time of data transmission (GPU -> CPU): {}".format(_format_time(self.d2h_time))) + append("number of transmission (GPU -> CPU): {}".format(self.d2h_count)) + + append("Possible data transmission events in PCIE:") + + seperation = '-' * 62 + row_format = '{:^10}' + '{:^12}' + '{:^16}' + '{:^12}' * 2 + + append(seperation) + append(row_format.format('Location', 'GPU time', 'Trans volume', 'Bandwidth', 'Num of calls')) + append(seperation) show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].cuda_time) for location, event in show_list: append(location) - append("cuda time: {}".format(_format_time(event.cuda_time))) - append("{:.1f}% of total pcie time".format(event.cuda_time / self.total_cuda_time * 100.0)) - append("pcie volme: {}".format(_format_memory(event.pcie_vol))) - append("average bandwith: {}".format(_format_bandwith(event.pcie_vol, event.cuda_time))) - append("number of calls: {}".format(event.count)) - append("----------------------------------------") + append( + row_format.format('', _format_time(event.cuda_time), _format_memory(event.pcie_vol), + _format_bandwidth(event.pcie_vol, event.cuda_time), event.count)) + append() return ''.join(res) diff --git a/colossalai/utils/profiler/prof_utils.py b/colossalai/utils/profiler/prof_utils.py index d71906868..641a514cf 100644 --- a/colossalai/utils/profiler/prof_utils.py +++ b/colossalai/utils/profiler/prof_utils.py @@ -32,7 +32,7 @@ def _format_memory(nbytes): return str(nbytes) + ' B' -def _format_bandwith(volme: float or int, time_us: int): +def _format_bandwidth(volme: float or int, time_us: int): sec_div_mb = (1000.0 / 1024.0)**2 mb_per_sec = volme / time_us * sec_div_mb