ColossalAI/colossalai/utils/profiler/comm_profiler.py

import inspect
from pathlib import Path
from functools import partial
import torch
from torch.autograd.profiler import profile
import torch.distributed as dist
from torch.distributed import ReduceOp
from colossalai.utils import get_current_device
from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth
from typing import List, Optional


def _get_code_location(depth: int):
    ret = []
    length = min(len(inspect.stack()), depth + 1)
    for i in range(3, length):
        upper_frame = inspect.stack()[i]
        function_name = inspect.stack()[i - 1].function
        ret.append(upper_frame.filename)
        ret.append('(')
        ret.append(str(upper_frame.lineno))
        ret.append('): ')
        ret.append(function_name)
        if i != length - 1:
            ret.append('\n')

    return ''.join(ret)


# Keep references to the torch.distributed collectives as they are when this
# module is imported. CommProfiler.enable() replaces these attributes on
# ``dist`` with the profiling wrappers defined in this file, so the saved
# handles are what the wrappers invoke and what CommProfiler.disable()
# assigns back.
torch_all_reduce = dist.all_reduce
torch_all_gather = dist.all_gather
torch_reduce_scatter = dist.reduce_scatter
torch_broadcast = dist.broadcast
torch_reduce = dist.reduce


class CommEvent(object):
    """Accumulator for communication statistics.

    Tracks a call count, a communication volume and a CUDA time, and can
    absorb the values of another event.
    """

    def __init__(self, count: int = 0, comm_vol: float = 0., cuda_time: int = 0):
        """Initialise the three counters from the given starting values."""
        self.self_count, self.self_comm_vol, self.self_cuda_time = count, comm_vol, cuda_time

    def add(self, rhs):
        """Add the counters of ``rhs`` onto this event in place."""
        self.self_cuda_time += rhs.self_cuda_time
        self.self_comm_vol += rhs.self_comm_vol
        self.self_count += rhs.self_count


class CommProfiler(BaseProfiler):
    """Communication profiler. Records all communication events.

    While enabled, ``all_reduce``, ``all_gather``, ``reduce_scatter``,
    ``broadcast`` and ``reduce`` on ``torch.distributed`` are replaced by the
    wrappers of this module. Each wrapped call runs inside a
    ``torch.autograd.profiler.profile`` session, and the measured kernel time
    and communication volume are aggregated per code location in
    ``ops_record``.
    """

    def __init__(self, depth: int = 0, total_count: int = 0, total_comm_vol: float = 0, total_cuda_time: int = 0):
        """Create a profiler.

        Args:
            depth: Number of additional caller frames to include in every
                recorded code location.
            total_count: Initial value of the call counter.
            total_comm_vol: Initial value of the accumulated volume.
            total_cuda_time: Initial value of the accumulated CUDA time.
        """
        super().__init__(profiler_name="Collective_Communication", priority=0)
        # _get_code_location() starts reporting at stack index 3, so 3 is the
        # base that makes ``depth=0`` report exactly one frame.
        self.depth = 3 + depth
        self.total_count = total_count
        self.total_comm_vol = total_comm_vol
        self.total_cuda_time = total_cuda_time

        self.ops_record = dict()
        self.profiler = None
        self.pending_op = None
        self.pending_metadata = None
        self.warn_flag = False

    def reset(self):
        """Drop every recorded statistic and any pending-operation state."""
        self.total_count = 0
        self.total_comm_vol = 0
        self.total_cuda_time = 0

        self.ops_record = dict()
        self.profiler = None
        self.pending_op = None
        self.pending_metadata = None
        self.warn_flag = False

    def enable(self):
        """Replace the ``torch.distributed`` collectives with profiled wrappers."""
        dist.all_reduce = partial(all_reduce, profiler=self)
        dist.all_gather = partial(all_gather, profiler=self)
        dist.reduce_scatter = partial(reduce_scatter, profiler=self)
        dist.broadcast = partial(broadcast, profiler=self)
        dist.reduce = partial(reduce, profiler=self)

    def disable(self):
        """Assign the collectives saved at import time back onto ``torch.distributed``."""
        dist.all_reduce = torch_all_reduce
        dist.all_gather = torch_all_gather
        dist.reduce_scatter = torch_reduce_scatter
        dist.broadcast = torch_broadcast
        dist.reduce = torch_reduce

    def to_tensorboard(self, writer):
        """Write the report as text through ``writer.add_text``."""
        writer.add_text(tag="Collective Communication", text_string=self.result_str("\n\n"))

    def to_file(self, filename: Path):
        """Write the report to ``filename``, overwriting existing content."""
        with open(filename, "w") as f:
            f.write(self.result_str())

    def show(self):
        """Print the report to standard output."""
        print(self.result_str())

    def result_str(self, sep: str = "\n"):
        """Build the textual profiling report.

        Args:
            sep: Separator appended after every emitted line.

        Returns:
            A fixed message when no CUDA time has been recorded, otherwise
            the summary followed by one entry per code location, ordered by
            descending CUDA time.
        """
        # Nothing recorded: answer right away. This check also protects the
        # percentage computation below from dividing by zero.
        if self.total_cuda_time == 0:
            return "No collective communication has been called yet!"

        res = []

        def append(s: str = None):
            if s is not None:
                res.append(s)
            res.append(sep)

        if self.warn_flag:
            append("Warnning: there exists multiple communication operations in the same time. As a result, "
                   "the profiling result is not accurate.")

        append("Collective communication profiling result:")
        append("total cuda time: {}".format(_format_time(self.total_cuda_time)))
        append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time)))
        append("total number of calls: {}".format(self.total_count))
        append("All events:")

        separator = '-' * 74
        row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2

        append(separator)
        append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))
        append(separator)

        show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)
        for location, event in show_list:
            append(location)
            append(
                row_format.format('', _format_time(event.self_cuda_time),
                                  '{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0),
                                  _format_memory(event.self_comm_vol),
                                  _format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count))
            append()

        return ''.join(res)

    @property
    def has_async_op(self):
        """Whether an asynchronous collective is still waiting to be closed."""
        return self.pending_op is not None

    @property
    def has_aync_op(self):
        """Misspelled alias of ``has_async_op``, kept for existing callers."""
        return self.has_async_op

    def activate_profiler(self, kn: str, vol: float):
        """Start a profiling session for one collective call.

        Args:
            kn: Substring used later to find the kernel among the profiler's
                function events.
            vol: Communication volume attributed to the call.
        """
        # _get_code_location() addresses frames by absolute stack index, so
        # it has to be called directly from this method; an extra helper
        # frame in between would shift the reported location.
        self.pending_metadata = (kn, _get_code_location(self.depth), vol)
        self.profiler = profile(enabled=True, use_cuda=True, use_cpu=True, use_kineto=True)
        self.profiler.__enter__()

    def close_profiler(self, group=None):
        """Stop the running session and record the collective it measured.

        When the profiler is enabled and ``group`` has more than one rank,
        the function events are searched for the single event whose name
        contains the pending kernel name. Its ``self_cuda_time_total`` is
        reduced with ``ReduceOp.MIN`` over ``group`` and the result is added
        to the totals and to the entry of the pending code location.

        The pending state is cleared even when exiting the session or one of
        the checks raises, so a failure cannot leave a stale session or a
        stale async handle behind for the next call.

        Args:
            group: Process group the collective ran on.

        Raises:
            AssertionError: If no session is running, if more than one
                matching event is found, or if none is found.
        """
        assert self.profiler is not None, "There is no running dist op"
        try:
            kernel_name, code_location, vol = self.pending_metadata
            self.profiler.__exit__(None, None, None)

            if self.profiler.enabled and dist.get_world_size(group) > 1:
                assert_flag = 0
                current_comm_event = None
                events = self.profiler.function_events
                for event in events:
                    if kernel_name in event.name:
                        assert assert_flag == 0, "Multiple dist ops has been called "
                        current_comm_event = CommEvent(1, vol, event.self_cuda_time_total)
                        assert_flag += 1

                assert current_comm_event is not None, "dist op has not been found"

                # Uses the saved original all_reduce so this bookkeeping
                # reduction is not routed through the profiling wrapper.
                buffer = torch.tensor([current_comm_event.self_cuda_time], device=get_current_device())
                torch_all_reduce(buffer, op=ReduceOp.MIN, group=group)
                current_comm_event.self_cuda_time = buffer.item()

                self.total_count += current_comm_event.self_count
                self.total_comm_vol += current_comm_event.self_comm_vol
                self.total_cuda_time += current_comm_event.self_cuda_time
                if code_location in self.ops_record:
                    self.ops_record[code_location].add(current_comm_event)
                else:
                    self.ops_record[code_location] = current_comm_event
        finally:
            self.profiler = None
            self.pending_op = None
            self.pending_metadata = None

    def wait_async_op(self):
        """Wait for the pending asynchronous collective, if any, and record it."""
        if self.pending_op is not None:
            op = self.pending_op
            op.wait()
            # NOTE(review): no group is passed here, so close_profiler()
            # runs with group=None even if the pending collective was issued
            # on another process group - confirm this is intended.
            self.close_profiler()


class CommHandler(object):
    """Communication handler. A dummy handler to wait for async operations.

    Instances only expose ``wait()``, which is forwarded to the profiler
    that was given at construction time.
    """

    def __init__(self, profiler: CommProfiler):
        """Remember the profiler that tracks the pending operation."""
        super().__init__()
        self.prof = profiler

    def wait(self):
        """Delegate to the profiler's ``wait_async_op``."""
        owner = self.prof
        owner.wait_async_op()


def async_check(profiler: CommProfiler):
    """Finish a still-pending asynchronous operation before a new one starts.

    If the profiler has a pending operation, its ``warn_flag`` is set and
    the operation is waited for; otherwise nothing happens.
    """
    if profiler.pending_op is None:
        return

    profiler.warn_flag = True
    profiler.wait_async_op()


def all_reduce(tensor: torch.Tensor,
               op: ReduceOp = ReduceOp.SUM,
               group=None,
               async_op: bool = False,
               profiler: CommProfiler = None) -> Optional[CommHandler]:
    """Profiled replacement for ``dist.all_reduce``.

    The recorded volume is the tensor's byte size scaled by
    ``2 * (n - 1) / n``, where ``n`` is the world size of ``group``.

    Returns:
        A ``CommHandler`` when ``async_op`` is true, otherwise ``None``.
    """
    # Settle any earlier asynchronous collective first.
    async_check(profiler)

    world_size = dist.get_world_size(group)
    factor = 2 * (world_size - 1) / world_size
    volume = factor * tensor.element_size() * tensor.numel()

    # Called directly from this body: the recorded code location is derived
    # from the stack depth at this point.
    profiler.activate_profiler("ncclKernel_AllReduce_", volume)
    profiler.pending_op = torch_all_reduce(tensor, op, group, async_op)

    if not async_op:
        profiler.close_profiler(group)
        return None

    return CommHandler(profiler)


def reduce_scatter(output: torch.Tensor,
                   input_list: List[torch.Tensor],
                   op: ReduceOp = ReduceOp.SUM,
                   group=None,
                   async_op: bool = False,
                   profiler: CommProfiler = None) -> Optional[CommHandler]:
    """Profiled replacement for ``dist.reduce_scatter``.

    The recorded volume is the summed byte size of ``input_list`` scaled by
    ``(n - 1) / n``, where ``n`` is the world size of ``group``.

    Returns:
        A ``CommHandler`` when ``async_op`` is true, otherwise ``None``.
    """
    # Settle any earlier asynchronous collective first.
    async_check(profiler)

    world_size = dist.get_world_size(group)
    factor = (world_size - 1) / world_size
    total_bytes = sum(item.element_size() * item.numel() for item in input_list)
    volume = total_bytes * factor

    # Called directly from this body: the recorded code location is derived
    # from the stack depth at this point.
    profiler.activate_profiler("ncclKernel_ReduceScatter_", volume)
    profiler.pending_op = torch_reduce_scatter(output, input_list, op, group, async_op)

    if not async_op:
        profiler.close_profiler(group)
        return None

    return CommHandler(profiler)


def all_gather(tensor_list: List[torch.Tensor],
               tensor: torch.Tensor,
               group=None,
               async_op: bool = False,
               profiler: CommProfiler = None) -> Optional[CommHandler]:
    """Profiled replacement for ``dist.all_gather``.

    The recorded volume is the summed byte size of ``tensor_list`` scaled by
    ``(n - 1) / n``, where ``n`` is the world size of ``group``.

    Returns:
        A ``CommHandler`` when ``async_op`` is true, otherwise ``None``.
    """
    # Settle any earlier asynchronous collective first.
    async_check(profiler)

    world_size = dist.get_world_size(group)
    factor = (world_size - 1) / world_size
    total_bytes = sum(item.element_size() * item.numel() for item in tensor_list)
    volume = total_bytes * factor

    # Called directly from this body: the recorded code location is derived
    # from the stack depth at this point.
    profiler.activate_profiler("ncclKernel_AllGather_", volume)
    profiler.pending_op = torch_all_gather(tensor_list, tensor, group, async_op)

    if not async_op:
        profiler.close_profiler(group)
        return None

    return CommHandler(profiler)


def broadcast(tensor: torch.Tensor,
              src: int,
              group=None,
              async_op: bool = False,
              profiler: CommProfiler = None) -> Optional[CommHandler]:
    """Profiled replacement for ``dist.broadcast``.

    The recorded volume is the byte size of ``tensor`` as a float.

    Returns:
        A ``CommHandler`` when ``async_op`` is true, otherwise ``None``.
    """
    # Settle any earlier asynchronous collective first.
    async_check(profiler)

    volume = 1.0 * tensor.element_size() * tensor.numel()

    # Called directly from this body: the recorded code location is derived
    # from the stack depth at this point.
    profiler.activate_profiler("ncclKernel_Broadcast_", volume)
    profiler.pending_op = torch_broadcast(tensor, src, group, async_op)

    if not async_op:
        profiler.close_profiler(group)
        return None

    return CommHandler(profiler)


def reduce(tensor: torch.Tensor,
           dst: int,
           op: ReduceOp = ReduceOp.SUM,
           group=None,
           async_op: bool = False,
           profiler: CommProfiler = None) -> Optional[CommHandler]:
    """Profiled replacement for ``dist.reduce``.

    The recorded volume is the byte size of ``tensor`` as a float.

    Returns:
        A ``CommHandler`` when ``async_op`` is true, otherwise ``None``.
    """
    # Settle any earlier asynchronous collective first.
    async_check(profiler)

    volume = 1.0 * tensor.element_size() * tensor.numel()

    # Called directly from this body: the recorded code location is derived
    # from the stack depth at this point.
    profiler.activate_profiler("ncclKernel_Reduce_", volume)
    profiler.pending_op = torch_reduce(tensor, dst, op, group, async_op)

    if not async_op:
        profiler.close_profiler(group)
        return None

    return CommHandler(profiler)
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`import inspect`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`from pathlib import Path`
			`from functools import partial`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`import torch`
			`from torch.autograd.profiler import profile`
			`import torch.distributed as dist`
			`from torch.distributed import ReduceOp`
			`from colossalai.utils import get_current_device`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`from .prof_utils import BaseProfiler, _format_time, _format_memory, _format_bandwidth`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`from typing import List, Optional`


			`def _get_code_location(depth: int):`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`ret = []`
			`length = min(len(inspect.stack()), depth + 1)`
			`for i in range(3, length):`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`upper_frame = inspect.stack()[i]`
			`function_name = inspect.stack()[i - 1].function`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`ret.append(upper_frame.filename)`
			`ret.append('(')`
			`ret.append(str(upper_frame.lineno))`
			`ret.append('): ')`
			`ret.append(function_name)`
			`if i != length - 1:`
			`ret.append('\n')`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`return ''.join(ret)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`torch_all_reduce = dist.all_reduce`
			`torch_all_gather = dist.all_gather`
			`torch_reduce_scatter = dist.reduce_scatter`
			`torch_broadcast = dist.broadcast`
			`torch_reduce = dist.reduce`


Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`class CommEvent(object):`
			`"""Communication Event. Used for communication time and communication`
			`volume recording.`
			`"""`

			`def __init__(self, count: int = 0, comm_vol: float = 0., cuda_time: int = 0):`
			`self.self_count = count`
			`self.self_comm_vol = comm_vol`
			`self.self_cuda_time = cuda_time`

			`def add(self, rhs):`
			`self.self_count += rhs.self_count`
			`self.self_comm_vol += rhs.self_comm_vol`
			`self.self_cuda_time += rhs.self_cuda_time`


Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`class CommProfiler(BaseProfiler):`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`"""Communication profiler. Records all communication events.`
			`"""`

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`def __init__(self, depth: int = 0, total_count: int = 0, total_comm_vol: float = 0, total_cuda_time: int = 0):`
			`super().__init__(profiler_name="Collective_Communication", priority=0)`
			`self.depth = 3 + depth`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`self.total_count = total_count`
			`self.total_comm_vol = total_comm_vol`
			`self.total_cuda_time = total_cuda_time`

			`self.ops_record = dict()`
			`self.profiler = None`
			`self.pending_op = None`
			`self.pending_metadata = None`
			`self.warn_flag = False`

			`def reset(self):`
			`self.total_count = 0`
			`self.total_comm_vol = 0`
			`self.total_cuda_time = 0`

			`self.ops_record = dict()`
			`self.profiler = None`
			`self.pending_op = None`
			`self.pending_metadata = None`
			`self.warn_flag = False`

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`def enable(self):`
			`dist.all_reduce = partial(all_reduce, profiler=self)`
			`dist.all_gather = partial(all_gather, profiler=self)`
			`dist.reduce_scatter = partial(reduce_scatter, profiler=self)`
			`dist.broadcast = partial(broadcast, profiler=self)`
			`dist.reduce = partial(reduce, profiler=self)`

			`def disable(self):`
			`dist.all_reduce = torch_all_reduce`
			`dist.all_gather = torch_all_gather`
			`dist.reduce_scatter = torch_reduce_scatter`
			`dist.broadcast = torch_broadcast`
			`dist.reduce = torch_reduce`

Fixed import bug for no-tensorboard environment (#354) 2022-03-09 11:48:04 +00:00			`def to_tensorboard(self, writer):`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`writer.add_text(tag="Collective Communication", text_string=self.result_str("\n\n"))`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00
			`def to_file(self, filename: Path):`
			`with open(filename, "w") as f:`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`f.write(self.result_str())`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`def show(self):`
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`print(self.result_str())`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00
fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`def result_str(self, sep: str = "\n"):`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`res = []`

polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`def append(s: str = None):`
			`if s is not None:`
			`res.append(s)`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`res.append(sep)`

Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`if self.warn_flag:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`append("Warnning: there exists multiple communication operations in the same time. As a result, "`
			`"the profiling result is not accurate.")`

fixed error when no collective communication in CommProfiler 2022-03-14 08:43:21 +00:00			`if self.total_cuda_time == 0:`
			`return "No collective communication has been called yet!"`

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`append("Collective communication profiling result:")`
			`append("total cuda time: {}".format(_format_time(self.total_cuda_time)))`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`append("average bandwidth: {}".format(_format_bandwidth(self.total_comm_vol, self.total_cuda_time)))`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`append("total number of calls: {}".format(self.total_count))`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`append("All events:")`

			`seperation = '-' * 74`
			`row_format = '{:^10}' + '{:^12}' * 2 + '{:^16}' + '{:^12}' * 2`

			`append(seperation)`
			`append(row_format.format('Location', 'GPU time', 'Percentage', 'Comm volume', 'Bandwidth', 'Num of calls'))`
			`append(seperation)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`show_list = sorted(self.ops_record.items(), key=lambda kv: -kv[1].self_cuda_time)`
			`for location, event in show_list:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`append(location)`
polished output format for communication profiler and pcie profiler (#404) fixed typing error 2022-03-14 08:07:45 +00:00			`append(`
			`row_format.format('', _format_time(event.self_cuda_time),`
			`'{:.1f}%'.format(event.self_cuda_time / self.total_cuda_time * 100.0),`
			`_format_memory(event.self_comm_vol),`
			`_format_bandwidth(event.self_comm_vol, event.self_cuda_time), event.self_count))`
			`append()`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00
			`return ''.join(res)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`@property`
			`def has_aync_op(self):`
			`return self.pending_op is not None`

			`def activate_profiler(self, kn: str, vol: float):`
			`self.pending_metadata = (kn, _get_code_location(self.depth), vol)`
			`self.profiler = profile(enabled=True, use_cuda=True, use_cpu=True, use_kineto=True)`
			`self.profiler.__enter__()`

			`def close_profiler(self, group=None):`
			`assert self.profiler is not None, "There is no running dist op"`
			`kernel_name, code_location, vol = self.pending_metadata`
			`self.profiler.__exit__(None, None, None)`

[profiler] Fixed bugs in CommProfiler and PcieProfiler (#377) 2022-03-10 09:54:55 +00:00			`if self.profiler.enabled and dist.get_world_size(group) > 1:`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`assert_flag = 0`
			`current_comm_event = None`
			`events = self.profiler.function_events`
			`for event in events:`
			`if kernel_name in event.name:`
			`assert assert_flag == 0, "Multiple dist ops has been called "`
			`current_comm_event = CommEvent(1, vol, event.self_cuda_time_total)`
			`assert_flag += 1`

			`assert current_comm_event is not None, "dist op has not been found"`

			`buffer = torch.tensor([current_comm_event.self_cuda_time], device=get_current_device())`
			`torch_all_reduce(buffer, op=ReduceOp.MIN, group=group)`
			`current_comm_event.self_cuda_time = buffer.item()`

			`self.total_count += current_comm_event.self_count`
			`self.total_comm_vol += current_comm_event.self_comm_vol`
			`self.total_cuda_time += current_comm_event.self_cuda_time`
			`if code_location in self.ops_record:`
			`self.ops_record[code_location].add(current_comm_event)`
			`else:`
			`self.ops_record[code_location] = current_comm_event`

			`self.profiler = None`
			`self.pending_op = None`
			`self.pending_metadata = None`

			`def wait_async_op(self):`
			`if self.pending_op is not None:`
			`op = self.pending_op`
			`op.wait()`
			`self.close_profiler()`


			`class CommHandler(object):`
			`"""Communication handler. A dummy handler to wait aync operations.`
			`"""`

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`def __init__(self, profiler: CommProfiler):`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00			`super().__init__()`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`self.prof = profiler`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`def wait(self):`
			`self.prof.wait_async_op()`


Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`def async_check(profiler: CommProfiler):`
			`if profiler.pending_op is not None:`
			`profiler.warn_flag = True`
			`profiler.wait_async_op()`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

			`def all_reduce(tensor: torch.Tensor,`
			`op: ReduceOp = ReduceOp.SUM,`
			`group=None,`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`async_op: bool = False,`
			`profiler: CommProfiler = None) -> Optional[CommHandler]:`
			`async_check(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`comm_size = dist.get_world_size(group)`
			`correction = 2 * (comm_size - 1) / comm_size`
			`comm_vol = correction * tensor.element_size() * tensor.numel()`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.activate_profiler("ncclKernel_AllReduce_", comm_vol)`
			`profiler.pending_op = torch_all_reduce(tensor, op, group, async_op)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`if async_op:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`return CommHandler(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.close_profiler(group)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

			`def reduce_scatter(output: torch.Tensor,`
			`input_list: List[torch.Tensor],`
			`op: ReduceOp = ReduceOp.SUM,`
			`group=None,`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`async_op: bool = False,`
			`profiler: CommProfiler = None) -> Optional[CommHandler]:`
			`async_check(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`comm_size = dist.get_world_size(group)`
			`correction = (comm_size - 1) / comm_size`
			`comm_vol = 0`
			`for tensor in input_list:`
			`comm_vol += tensor.element_size() * tensor.numel()`
			`comm_vol *= correction`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.activate_profiler("ncclKernel_ReduceScatter_", comm_vol)`
			`profiler.pending_op = torch_reduce_scatter(output, input_list, op, group, async_op)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`if async_op:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`return CommHandler(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.close_profiler(group)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

			`def all_gather(tensor_list: List[torch.Tensor],`
			`tensor: torch.Tensor,`
			`group=None,`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`async_op: bool = False,`
			`profiler: CommProfiler = None) -> Optional[CommHandler]:`
			`async_check(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`comm_size = dist.get_world_size(group)`
			`correction = (comm_size - 1) / comm_size`
			`comm_vol = 0`
			`for ten in tensor_list:`
			`comm_vol += ten.element_size() * ten.numel()`
			`comm_vol *= correction`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.activate_profiler("ncclKernel_AllGather_", comm_vol)`
			`profiler.pending_op = torch_all_gather(tensor_list, tensor, group, async_op)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`if async_op:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`return CommHandler(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.close_profiler(group)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`def broadcast(tensor: torch.Tensor,`
			`src: int,`
			`group=None,`
			`async_op: bool = False,`
			`profiler: CommProfiler = None) -> Optional[CommHandler]:`
			`async_check(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`comm_vol = 1.0 * tensor.element_size() * tensor.numel()`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.activate_profiler("ncclKernel_Broadcast_", comm_vol)`
			`profiler.pending_op = torch_broadcast(tensor, src, group, async_op)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`if async_op:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`return CommHandler(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.close_profiler(group)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00

			`def reduce(tensor: torch.Tensor,`
			`dst: int,`
			`op: ReduceOp = ReduceOp.SUM,`
			`group=None,`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`async_op: bool = False,`
			`profiler: CommProfiler = None) -> Optional[CommHandler]:`
			`async_check(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`comm_vol = 1.0 * tensor.element_size() * tensor.numel()`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.activate_profiler("ncclKernel_Reduce_", comm_vol)`
			`profiler.pending_op = torch_reduce(tensor, dst, op, group, async_op)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
			`if async_op:`
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`return CommHandler(profiler)`
Added profiler communication operations Fixed bug for learning rate scheduler 2022-03-04 02:17:45 +00:00
Added Profiler Context to manage all profilers (#340) 2022-03-09 08:12:41 +00:00			`profiler.close_profiler(group)`