ColossalAI/colossalai/engine/ophooks/_memtracer_ophook.py

from colossalai.context.parallel_mode import ParallelMode
import torch
from colossalai.engine.ophooks import BaseOpHook
from colossalai.registry import OPHOOKS
from colossalai.logging import get_dist_logger
from colossalai.core import global_context as gpc

from colossalai.utils.memory_tracer import AsyncMemoryMonitor

import math


@OPHOOKS.register_module
class MemTracerOpHook(BaseOpHook):
    """
    Collect GPU memory usage information

    The hook drives an ``AsyncMemoryMonitor`` around the forward/backward pass of
    every module that is in training mode. After ``warmup`` iterations the
    monitor's sampling interval is adapted to the measured iteration time, and
    afterwards the collected statistics are dumped to
    ``{data_prefix}-{rank}.pkl`` every ``refreshrate`` iterations.

    :param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50
    :type warmup: int
    :param refreshrate: This parameter decides the frequency of write file, defaults to 10.
        A value of 0 disables the periodic dump (``save_results`` can still be called manually)
    :type refreshrate: int
    :param data_prefix: The prefix of the stats data file, defaults to "memstats"
    :type data_prefix: string
    """

    def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):
        super().__init__()
        self.async_mem_monitor = AsyncMemoryMonitor()
        # number of iterations finished so far (incremented in `post_iter`)
        self._curiter = 0
        self._logger = get_dist_logger()
        # number of times the data file has been written by `post_iter`
        self._count = 0
        self._warmup = warmup
        self._refreshrate = refreshrate
        self._data_prefix = data_prefix
        # in distributed environment the global rank is part of the file name,
        # otherwise fall back to rank 0
        if gpc.is_initialized(ParallelMode.GLOBAL):
            self._rank = gpc.get_global_rank()
        else:
            self._rank = 0

    def _isvalid(self, module) -> bool:
        """
        Tell whether memory should be traced for ``module``.

        :param module: The module a hook has been triggered for
        :type module: torch.nn.Module
        :return: ``module.training``, i.e. only modules in training mode are traced
        :rtype: bool
        """
        assert isinstance(module, torch.nn.Module)
        return module.training

    def _resample(self):
        """
        Adapt the monitor's sampling interval to the average iteration time
        measured over the warmup iterations.

        The adjustment is skipped (the monitor keeps its current interval) when
        no meaningful average can be computed: ``warmup`` is not positive, fewer
        than two time stamps were recorded, or the elapsed time is not positive.
        """
        stamps = self.async_mem_monitor.time_stamps
        # guard against division by zero (warmup == 0) and an empty/too short record
        if self.warmup <= 0 or len(stamps) < 2:
            self._logger.debug("skip adaptive sampling: not enough warmup data")
            return
        # calculate the average iteration time
        total_time = stamps[-1] - stamps[0]
        avg_it_time = total_time / self.warmup
        self._logger.debug(f"total time for {self.warmup} iterations is {total_time}s")
        # the logarithm below is only defined for a strictly positive duration
        if avg_it_time <= 0:
            self._logger.debug("skip adaptive sampling: non-positive iteration time")
            return
        # adjust the sampling power: one more than the rounded negative decimal
        # exponent of the average iteration time
        power: int = round(-math.log10(avg_it_time)) + 1
        self._logger.debug(f"the power is {power}")
        self.async_mem_monitor.set_interval(power)

    @property
    def refreshrate(self) -> int:
        """Number of post-warmup iterations between two dumps of the data file."""
        return self._refreshrate

    @property
    def warmup(self) -> int:
        """Number of iterations that are skipped before statistics are dumped."""
        return self._warmup

    @property
    def curiter(self) -> int:
        """Number of iterations finished so far."""
        return self._curiter

    @property
    def valid_iter(self) -> int:
        """Number of iterations finished since the end of the warmup stage."""
        return self.curiter - self.warmup

    def pre_fwd_exec(self, module: torch.nn.Module, *args):
        """Restart the monitor before the forward pass of a module in training mode."""
        if self._isvalid(module):
            self.async_mem_monitor.finish()
            self.async_mem_monitor.start()

    def post_fwd_exec(self, module: torch.nn.Module, *args):
        """Finish the monitor after the forward pass of a module in training mode."""
        if self._isvalid(module):
            self.async_mem_monitor.finish()

    def pre_bwd_exec(self, module: torch.nn.Module, input, output):
        """Restart the monitor before the backward pass of a module in training mode."""
        if self._isvalid(module):
            self.async_mem_monitor.finish()
            self.async_mem_monitor.start()

    def post_bwd_exec(self, module: torch.nn.Module, input):
        """Finish the monitor after the backward pass of a module in training mode."""
        if self._isvalid(module):
            self.async_mem_monitor.finish()

    def pre_iter(self):
        """Nothing has to be done before an iteration."""
        pass

    def post_iter(self):
        """
        Finish the monitor at the end of an iteration and, depending on the
        stage, do nothing (warmup), adapt the sampling rate (exactly at the end
        of the warmup) or periodically dump the statistics (afterwards).
        """
        self.async_mem_monitor.finish()
        # in the warmup stage
        if self.curiter < self.warmup:
            pass
        # adjust the sampling rate
        elif self.curiter == self.warmup:
            # use adaptive sample rate
            self._resample()
        # record data to log file
        else:
            # every `refreshrate` times, refresh the file; `valid_iter` is
            # always positive in this branch, and a refreshrate of 0 (which
            # would make the modulo fail) means "never dump periodically"
            if self.refreshrate != 0 and self.valid_iter % self.refreshrate == 0:
                # output file info
                self._logger.info(f"dump a memory statistics as pickle to {self._data_prefix}-{self._rank}.pkl")
                self.save_results()
                self._count += 1
                self._logger.debug(f"data file has been refreshed {self._count} times")
        # finish a iteration
        self._curiter += 1

    def save_results(self):
        """Save the monitor's statistics to ``{data_prefix}-{rank}.pkl``."""
        datafile = f"{self._data_prefix}-{self._rank}.pkl"
        self.async_mem_monitor.save(datafile)
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`from colossalai.context.parallel_mode import ParallelMode`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00			`import torch`
move async memory to an individual directory (#345) 2022-03-09 08:31:25 +00:00			`from colossalai.engine.ophooks import BaseOpHook`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00			`from colossalai.registry import OPHOOKS`
			`from colossalai.logging import get_dist_logger`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`from colossalai.core import global_context as gpc`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00
move async memory to an individual directory (#345) 2022-03-09 08:31:25 +00:00			`from colossalai.utils.memory_tracer import AsyncMemoryMonitor`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00
move async memory to an individual directory (#345) 2022-03-09 08:31:25 +00:00			`import math`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00

			`@OPHOOKS.register_module`
			`class MemTracerOpHook(BaseOpHook):`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`"""`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`Collect GPU memory usage information`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`:param warmup: This parameter indicates how many iterations to truncate before profiling, defaults to 50`
			`:type warmup: int`
			`:param refreshrate: This parameter decides the frequency of write file, defaults to 10`
			`:type refreshrate: int`
			`:param data_prefix: The prefix of the stats data file, defaults to "memstats"`
			`:type data_prefix: string`
			`"""`
[zero] update zero context init with the updated test utils (#327) 2022-03-08 06:45:01 +00:00
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"):`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00			`super().__init__()`
			`self.async_mem_monitor = AsyncMemoryMonitor()`
			`self._curiter = 0`
			`self._logger = get_dist_logger()`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`self._count = 0`
			`self._warmup = warmup`
			`self._refreshrate = refreshrate`
			`self._data_prefix = data_prefix`
			`# in distributed environment`
			`if gpc.is_initialized(ParallelMode.GLOBAL):`
			`self._rank = gpc.get_global_rank()`
			`else:`
			`self._rank = 0`

			`def _isvalid(self, module) -> bool:`
			`assert isinstance(module, torch.nn.Module)`
			`return module.training`

[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`def _resample(self):`
			`# calculate the average iteration time`
			`total_time = (self.async_mem_monitor.time_stamps[-1] - self.async_mem_monitor.time_stamps[0])`
			`avg_it_time = total_time / self.warmup`
			`self._logger.debug(f"total time for {self.warmup} iterations is {total_time}s")`
			`# adjust the sampling power`
			`power: int = round(-math.log(avg_it_time, 10)) + 1`
			`self._logger.debug(f"the power is {power}")`
			`self.async_mem_monitor.set_interval(power)`

[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`@property`
			`def refreshrate(self) -> int:`
			`return self._refreshrate`
[zero] update zero context init with the updated test utils (#327) 2022-03-08 06:45:01 +00:00
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`@property`
			`def warmup(self) -> int:`
			`return self._warmup`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`@property`
			`def curiter(self) -> int:`
			`return self._curiter`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`@property`
			`def valid_iter(self) -> int:`
			`return self.curiter - self.warmup`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00
			`def pre_fwd_exec(self, module: torch.nn.Module, *args):`
			`if self._isvalid(module):`
			`self.async_mem_monitor.finish()`
			`self.async_mem_monitor.start()`

			`def post_fwd_exec(self, module: torch.nn.Module, *args):`
			`if self._isvalid(module):`
			`self.async_mem_monitor.finish()`

			`def pre_bwd_exec(self, module: torch.nn.Module, input, output):`
			`if self._isvalid(module):`
			`self.async_mem_monitor.finish()`
			`self.async_mem_monitor.start()`

			`def post_bwd_exec(self, module: torch.nn.Module, input):`
			`if self._isvalid(module):`
			`self.async_mem_monitor.finish()`

			`def pre_iter(self):`
			`pass`

			`def post_iter(self):`
			`self.async_mem_monitor.finish()`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`# in the warmup stage`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`if self.curiter < self.warmup:`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`pass`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`# adjust the sampling rate`
			`elif self.curiter == self.warmup:`
			`# use adaptive sample rate`
			`self._resample()`
			`# record data to log file`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`else:`
			# every `refreshrate` times, refresh the file
			`if self.valid_iter != 0 and self.valid_iter % self.refreshrate == 0:`
			`# output file info`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`self._logger.info(f"dump a memory statistics as pickle to {self._data_prefix}-{self._rank}.pkl")`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`self.save_results()`
			`self._count += 1`
[profiler] add adaptive sampling to memory profiler (#330) * fix merge conflict modify unit test remove unnessesary log info reformat file * remove unused module * remove unnecessary sync function * change doc string style from Google to Sphinx 2022-03-09 03:07:10 +00:00			`self._logger.debug(f"data file has been refreshed {self._count} times")`
[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`# finish a iteration`
add pytorch hooks (#179) * add pytorch hooks fix #175 * remove licenses in src code * add gpu memory tracer * replacing print with logger in ophooks. 2022-01-25 14:20:54 +00:00			`self._curiter += 1`

[profiler] primary memory tracer 2022-03-04 01:35:23 +00:00			`def save_results(self):`
			`datafile = f"{self._data_prefix}-{self._rank}.pkl"`
Flake8 code restyle 2022-03-09 07:17:01 +00:00			`self.async_mem_monitor.save(datafile)`