ColossalAI/colossalai/gemini/memory_tracer/memstats_collector.py

from colossalai.gemini.memory_tracer import SyncCudaMemoryMonitor
from colossalai.utils.memory import colo_device_memory_used, colo_device_memory_capacity
from colossalai.utils import get_current_device
from colossalai.gemini.stateful_tensor import StatefulTensor
from colossalai.gemini import ChunkManager

import torch
import time
from typing import List


class MemStatsCollector:
    """
    A memory statistics collector.

    It works in two phases.
    Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.
    The first iteration of DNN training.
    Phase 2. Runtime Phase: use the read-only collected stats.
    The remaining iterations of DNN training.

    It keeps a step counter (``_step_idx``) that wraps around once every
    collected sample has been consumed.
    """

    def __init__(self) -> None:
        self._mem_monitor = SyncCudaMemoryMonitor()

        # One entry per sampling step; the lists below are kept index-aligned.
        self._model_data_cuda_list: List[int] = []
        self._overall_cuda_list: List[int] = []

        self._model_data_cpu_list: List[int] = []
        self._overall_cpu_list: List[int] = []

        self._non_model_data_cuda_list: List[int] = []
        self._non_model_data_cpu_list: List[int] = []
        # Wall-clock timestamps (time.time()) taken at each overall sample.
        self._sampling_time: List[float] = []

        self._start_flag = False    # True only while the collection phase is running
        self._step_idx = 0    # index of the next sample handed out at runtime
        self._step_total = 0    # number of samples recorded by the last collection

    def overall_mem_stats(self, device_type: str) -> List[int]:
        """Return the recorded overall memory usage samples of a device.

        Args:
            device_type (str): device type, can be 'cpu' or 'cuda'.

        Returns:
            List[int]: the internal list of samples (not a copy).

        Raises:
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        if device_type == 'cuda':
            return self._overall_cuda_list
        elif device_type == 'cpu':
            return self._overall_cpu_list
        else:
            raise TypeError(f"device_type must be 'cuda' or 'cpu', got {device_type!r}")

    def model_data_list(self, device_type: str) -> List[int]:
        """Return the recorded model data memory usage samples of a device.

        Args:
            device_type (str): device type, can be 'cpu' or 'cuda'.

        Returns:
            List[int]: the internal list of samples (not a copy).

        Raises:
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        if device_type == 'cuda':
            return self._model_data_cuda_list
        elif device_type == 'cpu':
            return self._model_data_cpu_list
        else:
            raise TypeError(f"device_type must be 'cuda' or 'cpu', got {device_type!r}")

    def non_model_data_list(self, device_type: str) -> List[int]:
        """Return the recorded non model data memory usage samples of a device.

        Each entry is the overall sample minus the model data sample of the
        same step.

        Args:
            device_type (str): device type, can be 'cpu' or 'cuda'.

        Returns:
            List[int]: the internal list of samples (not a copy).

        Raises:
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        if device_type == 'cuda':
            return self._non_model_data_cuda_list
        elif device_type == 'cpu':
            return self._non_model_data_cpu_list
        else:
            raise TypeError(f"device_type must be 'cuda' or 'cpu', got {device_type!r}")

    def next_period_non_model_data_usage(self, device_type: str) -> int:
        """Get the non model data memory usage recorded for the next sampling period.

        Every call advances the internal step index, which wraps around after
        the last collected sample.

        Args:
            device_type (str): device type, can be 'cpu' or 'cuda'.

        Returns:
            int: non model data memory usage recorded at the current step index.

        Raises:
            AssertionError: if called during the collection phase or before
                any collection has finished.
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        assert not self._start_flag, 'Cannot get mem stats info during collection phase.'
        assert self._step_total > 0, 'Cannot get mem stats info before collection phase.'
        next_non_model_data = self.non_model_data_list(device_type)[self._step_idx]
        self._step_idx = (self._step_idx + 1) % self._step_total
        return next_non_model_data

    @property
    def sampling_time(self):
        """Sampling timestamps expressed relative to the first sample (empty if none)."""
        return [t - self._sampling_time[0] for t in self._sampling_time]

    def start_collection(self):
        """Enter the collection phase and start the memory monitor."""
        self._start_flag = True
        self._mem_monitor.start()

    def finish_collection(self):
        """Take a last overall sample, leave the collection phase and stop the monitor."""
        # Must run while _start_flag is still True, otherwise the sample is skipped.
        self.sample_overall_data()
        self._step_total = len(self._sampling_time)
        self._start_flag = False
        # sample_overall_data() restarts the monitor, so stop it once more here.
        self._mem_monitor.finish()

    def sample_model_data(self) -> None:
        """Sampling model data statistics.

        Does nothing outside the collection phase.
        """
        if self._start_flag:
            cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']
            cpu_mem = StatefulTensor.GST_MGR.total_mem['cpu']
            self._model_data_cuda_list.append(cuda_mem)
            self._model_data_cpu_list.append(cpu_mem)

    def sample_overall_data(self) -> None:
        """Sampling overall memory statistics and deriving non model data statistics.

        Does nothing outside the collection phase, or when no model data
        sample has been recorded yet.
        """
        if not self._start_flag:
            return

        # overall data recording is after model data recording
        if not self._model_data_cuda_list:
            return

        # The value returned by the monitor's finish() is stored as the overall
        # CUDA usage of this step (presumably the peak of the period -- confirm
        # against SyncCudaMemoryMonitor).
        self._overall_cuda_list.append(self._mem_monitor.finish())
        self._overall_cpu_list.append(colo_device_memory_used(torch.device('cpu')))

        assert len(self._model_data_cuda_list) == len(self._overall_cuda_list)

        self._non_model_data_cuda_list.append(self._overall_cuda_list[-1] - self._model_data_cuda_list[-1])
        self._non_model_data_cpu_list.append(self._overall_cpu_list[-1] - self._model_data_cpu_list[-1])
        self._sampling_time.append(time.time())
        # Restart monitoring for the next sampling period.
        self._mem_monitor.start()

    def clear(self) -> None:
        """Drop all collected statistics and reset the phase/step state."""
        self._model_data_cuda_list = []
        self._overall_cuda_list = []

        self._model_data_cpu_list = []
        self._overall_cpu_list = []

        self._non_model_data_cpu_list = []
        self._non_model_data_cuda_list = []
        # Timestamps must be dropped together with the samples: finish_collection()
        # derives _step_total from len(_sampling_time), so stale entries would make
        # it larger than the sample lists after a re-collection.
        self._sampling_time = []

        self._start_flag = False
        self._step_idx = 0
        self._step_total = 0


class MemStatsCollectorV2(MemStatsCollector):
    """Memory statistics collector that reads model data usage from a chunk manager."""

    def __init__(self, chunk_manager: ChunkManager) -> None:
        """Set up the base collector and remember the chunk manager to query.

        Args:
            chunk_manager (ChunkManager): object whose ``total_mem`` mapping is
                read for the 'cuda' and 'cpu' model data usage.
        """
        super().__init__()
        self._chunk_manager = chunk_manager

    def sample_model_data(self) -> None:
        """Record the chunk manager's current model data usage.

        Does nothing outside the collection phase.
        """
        if not self._start_flag:
            return

        manager = self._chunk_manager
        # Read both values before appending so the two lists stay aligned
        # even if one of the lookups fails.
        cuda_bytes, cpu_bytes = manager.total_mem['cuda'], manager.total_mem['cpu']
        self._model_data_cuda_list.append(cuda_bytes)
        self._model_data_cpu_list.append(cpu_bytes)

    @property
    def cuda_margin_mem(self) -> float:
        """Memory capacity of the current device minus the largest overall CUDA sample.

        Raises:
            ValueError: if no overall CUDA sample has been recorded yet
                (``max`` of an empty list).
        """
        capacity = colo_device_memory_capacity(get_current_device())
        peak_overall = max(self.overall_mem_stats('cuda'))
        return capacity - peak_overall
[refactor] moving memtracer to gemini (#801) 2022-04-19 02:13:08 +00:00			`from colossalai.gemini.memory_tracer import SyncCudaMemoryMonitor`
[gemini] zero supports gemini (#1093) * add placement policy * add gemini mgr * update mem stats collector * update zero * update zero optim * fix bugs * zero optim monitor os * polish unit test * polish unit test * add assert 2022-06-10 06:48:28 +00:00			`from colossalai.utils.memory import colo_device_memory_used, colo_device_memory_capacity`
			`from colossalai.utils import get_current_device`
[zero] use GeminiMemoryManager when sampling model data (#850) 2022-04-24 09:17:22 +00:00			`from colossalai.gemini.stateful_tensor import StatefulTensor`
[refactor] move chunk and chunkmgr to directory gemini (#1182) 2022-06-29 05:31:02 +00:00			`from colossalai.gemini import ChunkManager`
[refactor] moving memtracer to gemini (#801) 2022-04-19 02:13:08 +00:00
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`import torch`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`import time`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`from typing import List`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00

			`class MemStatsCollector:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`"""`
			`A Memory statistic collector.`
			`It works in two phases.`
			`Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.`
			`The first iteration of DNN training.`
			`Phase 2. Runtime Phase: use the read-only collected stats`
			`The rest iterations of DNN training.`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`It has a Sampling counter which is reset after DNN training iteration.`
			`"""`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def __init__(self) -> None:`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`self._mem_monitor = SyncCudaMemoryMonitor()`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`self._non_model_data_cuda_list = []`
			`self._non_model_data_cpu_list = []`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`self._sampling_time = []`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`self._start_flag = False`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`self._step_idx = 0`
			`self._step_total = 0`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`def overall_mem_stats(self, device_type: str) -> List[int]:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`if device_type == 'cuda':`
			`return self._overall_cuda_list`
			`elif device_type == 'cpu':`
			`return self._overall_cpu_list`
			`else:`
			`raise TypeError`
[zero] dump memory stats for sharded model (#548) 2022-03-30 01:38:44 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`def model_data_list(self, device_type: str) -> List[int]:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`if device_type == 'cuda':`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`return self._model_data_cuda_list`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`elif device_type == 'cpu':`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`return self._model_data_cpu_list`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`else:`
			`raise TypeError`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`def non_model_data_list(self, device_type: str) -> List[int]:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`if device_type == 'cuda':`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`return self._non_model_data_cuda_list`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`elif device_type == 'cpu':`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`return self._non_model_data_cpu_list`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`else:`
			`raise TypeError`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`def next_period_non_model_data_usage(self, device_type: str) -> int:`
[zero] refactor memstats collector (#706) * refactor memstats collector * fix disposable * polish code 2022-04-11 02:46:08 +00:00			`"""Get max non model data memory usage of current sampling period`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00
[zero] refactor memstats collector (#706) * refactor memstats collector * fix disposable * polish code 2022-04-11 02:46:08 +00:00			`Args:`
			`device_type (str): device type, can be 'cpu' or 'cuda'.`

			`Returns:`
			`int: max non model data memory usage of current sampling period`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`"""`
[zero] refactor memstats collector (#706) * refactor memstats collector * fix disposable * polish code 2022-04-11 02:46:08 +00:00			`assert not self._start_flag, 'Cannot get mem stats info during collection phase.'`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`assert self._step_total > 0, 'Cannot get mem stats info before collection phase.'`
			`next_non_model_data = self.non_model_data_list(device_type)[self._step_idx]`
			`self._step_idx = (self._step_idx + 1) % self._step_total`
			`return next_non_model_data`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`@property`
			`def sampling_time(self):`
			`return [t - self._sampling_time[0] for t in self._sampling_time]`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`def start_collection(self):`
			`self._start_flag = True`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._mem_monitor.start()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def finish_collection(self):`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`self.sample_overall_data()`
			`self._step_total = len(self._sampling_time)`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`self._start_flag = False`
[zero] refactor memstats collector (#706) * refactor memstats collector * fix disposable * polish code 2022-04-11 02:46:08 +00:00			`self._mem_monitor.finish()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`def sample_model_data(self) -> None:`
			`"""Sampling model data statistics.`
			`"""`
			`if self._start_flag:`
[zero] use GeminiMemoryManager when sampling model data (#850) 2022-04-24 09:17:22 +00:00			`cuda_mem = StatefulTensor.GST_MGR.total_mem['cuda']`
			`cpu_mem = StatefulTensor.GST_MGR.total_mem['cpu']`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`self._model_data_cuda_list.append(cuda_mem)`
			`self._model_data_cpu_list.append(cpu_mem)`

			`def sample_overall_data(self) -> None:`
			`"""Sampling non model data statistics.`
			`"""`
			`if self._start_flag:`
			`# overall data recording is after model data recording`
			`if len(self._model_data_cuda_list) == 0:`
			`return`

			`self._overall_cuda_list.append(self._mem_monitor.finish())`
			`self._overall_cpu_list.append(colo_device_memory_used(torch.device('cpu')))`

			`assert len(self._model_data_cuda_list) == len(self._overall_cuda_list)`

			`self._non_model_data_cuda_list.append(self._overall_cuda_list[-1] - self._model_data_cuda_list[-1])`
			`self._non_model_data_cpu_list.append(self._overall_cpu_list[-1] - self._model_data_cpu_list[-1])`
			`self._sampling_time.append(time.time())`
			`self._mem_monitor.start()`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`def clear(self) -> None:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] moving memtracer to gemini (#801) 2022-04-19 02:13:08 +00:00			`self._non_model_data_cpu_list = []`
			`self._non_model_data_cuda_list = []`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`self._start_flag = False`
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`self._step_idx = 0`
			`self._step_total = 0`
[tensor] refactor chunk mgr and impl MemStatsCollectorV2 (#1077) * polish chunk manager * polish unit test * impl add_extern_static_tensor for chunk mgr * add mem stats collector v2 * polish code * polish unit test * polish code * polish get chunks 2022-06-09 12:56:34 +00:00

			`class MemStatsCollectorV2(MemStatsCollector):`

			`def __init__(self, chunk_manager: ChunkManager) -> None:`
			`super().__init__()`
			`self._chunk_manager = chunk_manager`

			`def sample_model_data(self) -> None:`
			`"""Sampling model data statistics.`
			`"""`
			`if self._start_flag:`
			`cuda_mem = self._chunk_manager.total_mem['cuda']`
			`cpu_mem = self._chunk_manager.total_mem['cpu']`
			`self._model_data_cuda_list.append(cuda_mem)`
			`self._model_data_cpu_list.append(cpu_mem)`
[gemini] zero supports gemini (#1093) * add placement policy * add gemini mgr * update mem stats collector * update zero * update zero optim * fix bugs * zero optim monitor os * polish unit test * polish unit test * add assert 2022-06-10 06:48:28 +00:00
			`@property`
			`def cuda_margin_mem(self) -> float:`
			`return colo_device_memory_capacity(get_current_device()) - max(self.overall_mem_stats('cuda'))`