[utils] add synchronized cuda memory monitor (#740)

pull/742/head
HELSON 2022-04-13 10:50:54 +08:00 committed by GitHub
parent e6212f56cd
commit 340e59f968
4 changed files with 149 additions and 110 deletions


@@ -1,9 +1,8 @@
from cgitb import Hook
from colossalai.registry import HOOKS
from torch import Tensor
from colossalai.trainer.hooks import BaseHook
from colossalai.utils.memory_tracer import AsyncMemoryMonitor
from ._metric_hook import LearningRateMetric, MetricHook
@HOOKS.register_module
class MemTraceHook(BaseHook):
@@ -11,6 +10,7 @@ class MemTraceHook(BaseHook):
This hook is used to record memory usage info and pass it to trainer.states.
You can use it like any other trainer hook and fetch data from trainer.states['metrics'][mode].
"""
def __init__(
self,
priority: int = 0,


@@ -1,4 +1,4 @@
from .async_memtracer import AsyncMemoryMonitor
from .memory_monitor import AsyncMemoryMonitor, SyncCudaMemoryMonitor
from .memstats_collector import MemStatsCollector
__all__ = ['AsyncMemoryMonitor', 'MemStatsCollector']
__all__ = ['AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector']
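
With the tracer module renamed from `async_memtracer` to `memory_monitor`, downstream code is expected to pull both monitors from the package itself rather than the old module path; a one-line sketch:

    # old path: from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor
    from colossalai.utils.memory_tracer import AsyncMemoryMonitor, SyncCudaMemoryMonitor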


@@ -1,6 +1,7 @@
from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor
from time import sleep, time
import pickle
import json
import torch
@@ -8,7 +9,42 @@ from colossalai.utils.memory import colo_device_memory_used
from colossalai.utils import get_current_device
class AsyncMemoryMonitor:
class MemoryMonitor:
"""Base class for all types of memory monitor.
All monitors should have a list called `time_stamps` and a list called `mem_stats`.
"""
def __init__(self):
self.time_stamps = []
self.mem_stats = []
def __len__(self):
return len(self.mem_stats)
@abstractmethod
def start(self):
pass
@abstractmethod
def finish(self):
pass
def state_dict(self):
return {
"time_stamps": self.time_stamps,
"mem_stats": self.mem_stats,
}
def save(self, filename):
with open(filename, "w") as f:
json.dump(self.state_dict(), f)
def clear(self):
self.mem_stats.clear()
self.time_stamps.clear()
class AsyncMemoryMonitor(MemoryMonitor):
"""
An asynchronous memory monitor running during computation. It samples the memory usage
of the current GPU at intervals of `1/(10**power)` sec.
@@ -31,15 +67,15 @@ class AsyncMemoryMonitor:
async_mem_monitor.finish()
async_mem_monitor.save('log.json')
Args:
power (int, optional): the power of the time interval. Defaults to 10.
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
.. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
https://arxiv.org/abs/2108.05818
"""
def __init__(self, power: int = 10):
super().__init__()
self.keep_measuring = False
current_device = get_current_device()
@@ -50,11 +86,6 @@ class AsyncMemoryMonitor:
self.executor = ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)
self.monitor_thread = None
self.interval = 1 / (10**power)
self.time_stamps = []
self.mem_stats = []
def __len__(self):
return len(self.mem_stats)
def set_interval(self, power: int):
self.clear()
@@ -70,8 +101,10 @@ class AsyncMemoryMonitor:
def finish(self):
if self.keep_measuring is False:
return 0
self.keep_measuring = False
max_usage = self.monitor_thread.result()
self.monitor_thread = None
self.time_stamps.append(time())
self.mem_stats.append(max_usage)
@@ -87,17 +120,23 @@ class AsyncMemoryMonitor:
sleep(self.interval)
return max_usage
@property
def state_dict(self):
return {
"time_stamps": self.time_stamps,
"mem_stats": self.mem_stats,
}
def save(self, filename):
with open(filename, "wb") as f:
pickle.dump(self.state_dict(), f)
class SyncCudaMemoryMonitor(MemoryMonitor):
"""
A synchronized CUDA memory monitor.
It only records the maximum allocated CUDA memory from the start point to the finish point.
"""
def clear(self):
self.mem_stats.clear()
self.time_stamps.clear()
def __init__(self, power: int = 10):
super().__init__()
def start(self):
torch.cuda.synchronize()
torch.cuda.reset_peak_memory_stats()
def finish(self):
torch.cuda.synchronize()
self.time_stamps.append(time())
max_usage = torch.cuda.max_memory_allocated()
self.mem_stats.append(max_usage)
return max_usage
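
A minimal usage sketch of the new SyncCudaMemoryMonitor, based only on the interface shown above; the model and tensor shapes are arbitrary placeholders and a CUDA device is assumed:

    import json
    import torch
    from colossalai.utils.memory_tracer import SyncCudaMemoryMonitor

    monitor = SyncCudaMemoryMonitor()
    model = torch.nn.Linear(1024, 1024).cuda()
    data = torch.randn(64, 1024, device='cuda')

    monitor.start()                 # synchronize, then reset the CUDA peak-memory counter
    model(data).sum().backward()
    peak_bytes = monitor.finish()   # synchronize and record torch.cuda.max_memory_allocated()

    print(f"peak allocated: {peak_bytes / 1024**2:.1f} MB")

    # The base-class save() dumps time_stamps/mem_stats as JSON, so it can be reloaded directly.
    monitor.save('mem_stats.json')
    with open('mem_stats.json') as f:
        print(json.load(f))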


@@ -1,6 +1,6 @@
from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory import colo_device_memory_used
from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor
from colossalai.utils.memory_tracer import AsyncMemoryMonitor
import torch
import time
from typing import List
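
Because both monitors now derive from the shared MemoryMonitor base class, code that previously hard-wired AsyncMemoryMonitor (such as MemStatsCollector above) can accept either implementation through the common start()/finish() interface. A hypothetical helper illustrating this; the function name and workload are illustrative only, not part of this commit:

    import torch
    from colossalai.utils.memory_tracer import AsyncMemoryMonitor, SyncCudaMemoryMonitor
    from colossalai.utils.memory_tracer.memory_monitor import MemoryMonitor


    def measure_peak(monitor: MemoryMonitor, workload) -> int:
        """Run `workload` between start() and finish(), then return the peak usage reported."""
        monitor.start()
        workload()
        return monitor.finish()


    model = torch.nn.Linear(512, 512).cuda()


    def workload():
        model(torch.randn(32, 512, device='cuda')).sum().backward()


    # The synchronous monitor reads the CUDA allocator's peak; the async one samples in a thread.
    for monitor in (SyncCudaMemoryMonitor(), AsyncMemoryMonitor(power=3)):
        print(type(monitor).__name__, measure_peak(monitor, workload))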