# ColossalAI/colossalai/utils/memory_tracer/memstats_collector.py

from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory_utils.utils import colo_device_memory_used
from colossalai.utils import get_current_device

import torch
import time
from typing import List


class SamplingCounter:
    """A minimal monotonically advancing counter of sampling steps.

    The counter starts at zero, grows by one on every ``advance()`` call and
    goes back to zero on ``reset()``. The current value is exposed read-only
    through the ``sampling_cnt`` property.
    """

    def __init__(self) -> None:
        # NOTE: the attribute spelling ("samplint") is historical and is kept
        # unchanged so that any code touching the private field keeps working.
        self._samplint_cnt = 0

    @property
    def sampling_cnt(self):
        """Number of ``advance()`` calls since construction or the last reset."""
        return self._samplint_cnt

    def reset(self):
        """Set the counter back to zero."""
        self._samplint_cnt = 0

    def advance(self):
        """Move the counter forward by exactly one step."""
        self._samplint_cnt = self._samplint_cnt + 1


class MemStatsCollector:
    """
    A memory statistics collector.

    It works in two phases.
    Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.
    The first iteration of DNN training.
    Phase 2. Runtime Phase: use the read-only collected stats.
    The remaining iterations of DNN training.

    It has a sampling counter which is reset after each DNN training iteration.
    """

    # Divisors used to convert a raw usage value into the requested unit.
    _UNIT_SCALES = {'GB': 1e9, 'MB': 1e6, 'KB': 1e3, 'B': 1}

    def __init__(self) -> None:
        self._sampling_cnter = SamplingCounter()

        # Per-sample usage records; all four lists grow in lockstep.
        self._model_data_cuda_list = []
        self._overall_cuda_list = []

        self._model_data_cpu_list = []
        self._overall_cpu_list = []

        # Wall-clock timestamp (``time.time()``) of every recorded sample.
        self._sampling_time = []

        # True only between start_collection() and finish_collection()/clear().
        self._start_flag = False

    @classmethod
    def _scale_of(cls, unit: str):
        """Return the divisor for ``unit``; raise ``TypeError`` if unsupported."""
        try:
            return cls._UNIT_SCALES[unit]
        except KeyError:
            # TypeError (not ValueError) is kept for backward compatibility.
            raise TypeError(f"unsupported unit {unit!r}, expected one of "
                            f"{list(cls._UNIT_SCALES)}") from None

    def overall_mem_stats(self, device_type: str):
        """Return the recorded overall memory usage samples of a device.

        Args:
            device_type: ``'cuda'`` or ``'cpu'``.

        Returns:
            The internal sample list itself (not a copy).

        Raises:
            TypeError: if ``device_type`` is neither ``'cuda'`` nor ``'cpu'``.
        """
        if device_type == 'cuda':
            return self._overall_cuda_list
        elif device_type == 'cpu':
            return self._overall_cpu_list
        else:
            raise TypeError(f"unsupported device_type {device_type!r}, expected 'cuda' or 'cpu'")

    def model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[float]:
        """Return the recorded model data usage samples, scaled to ``unit``.

        Despite the name, both CUDA and CPU records can be queried.

        Args:
            device_type: ``'cuda'`` or ``'cpu'``.
            unit: one of ``'GB'``, ``'MB'``, ``'KB'``, ``'B'`` (decimal scales).

        Returns:
            A new list with every sample divided by the unit scale.

        Raises:
            TypeError: if ``unit`` or ``device_type`` is not supported.
        """
        scale = self._scale_of(unit)

        if device_type == 'cuda':
            return [elem / scale for elem in self._model_data_cuda_list]
        elif device_type == 'cpu':
            return [elem / scale for elem in self._model_data_cpu_list]
        else:
            raise TypeError(f"unsupported device_type {device_type!r}, expected 'cuda' or 'cpu'")

    def non_model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[float]:
        """Return the non model data usage samples, scaled to ``unit``.

        Each value is the overall usage minus the model data usage of the same
        sample. Despite the name, both CUDA and CPU records can be queried.

        Args:
            device_type: ``'cuda'`` or ``'cpu'``.
            unit: one of ``'GB'``, ``'MB'``, ``'KB'``, ``'B'`` (decimal scales).

        Returns:
            A new list with one scaled difference per sample.

        Raises:
            TypeError: if ``unit`` or ``device_type`` is not supported.
        """
        scale = self._scale_of(unit)

        if device_type == 'cuda':
            return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cuda_list, self._model_data_cuda_list)]
        elif device_type == 'cpu':
            return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cpu_list, self._model_data_cpu_list)]
        else:
            raise TypeError(f"unsupported device_type {device_type!r}, expected 'cuda' or 'cpu'")

    @property
    def sampling_time(self):
        """Sample timestamps in seconds relative to the first recorded sample.

        Returns an empty list when nothing has been sampled yet.
        """
        return [t - self._sampling_time[0] for t in self._sampling_time]

    def start_collection(self):
        """Enable recording in subsequent ``sample_memstats()`` calls."""
        self._start_flag = True

    def finish_collection(self):
        """Disable recording; already collected samples are kept."""
        self._start_flag = False

    def sample_memstats(self) -> None:
        """
        Sampling memory statistics.
        While collection is enabled, record the current model data usage and the
        overall device usage for both CUDA and CPU, plus the sampling timestamp.
        The sampling counter is advanced on every call, whether or not
        collection is enabled.
        """
        if self._start_flag:
            sampling_cnt = self._sampling_cnter.sampling_cnt
            # The counter must be in step with the number of samples recorded so far.
            assert sampling_cnt == len(self._overall_cuda_list)
            self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)
            self._overall_cuda_list.append(colo_device_memory_used(get_current_device()))

            self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)
            self._overall_cpu_list.append(colo_device_memory_used(torch.device('cpu')))

            self._sampling_time.append(time.time())

        self._sampling_cnter.advance()

    def reset_sampling_cnter(self) -> None:
        """Reset the sampling counter to zero without touching the samples."""
        self._sampling_cnter.reset()

    def clear(self) -> None:
        """Drop all collected samples, stop collection and reset the counter."""
        self._model_data_cuda_list = []
        self._overall_cuda_list = []

        self._model_data_cpu_list = []
        self._overall_cpu_list = []

        # Timestamps belong to the dropped samples; keeping them would leave
        # sampling_time out of step with the memory lists.
        self._sampling_time = []

        self._start_flag = False
        self._sampling_cnter.reset()
[polish] use GLOBAL_MODEL_DATA_TRACER (#417) 3 years ago			`from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER`
[refactor] memory utils (#577) 3 years ago			`from colossalai.utils.memory_utils.utils import colo_device_memory_used`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago			`from colossalai.utils import get_current_device`

			`import torch`
[zero] add sampling time for memstats collector (#610) 3 years ago			`import time`
[refactor] memory utils (#577) 3 years ago			`from typing import List`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago

			`class SamplingCounter:`

			`def __init__(self) -> None:`
			`self._samplint_cnt = 0`

			`def advance(self):`
			`self._samplint_cnt += 1`

			`@property`
			`def sampling_cnt(self):`
			`return self._samplint_cnt`

			`def reset(self):`
			`self._samplint_cnt = 0`


			`class MemStatsCollector:`
[refactor] memory utils (#577) 3 years ago			`"""`
			`A Memory statistic collector.`
			`It works in two phases.`
			`Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.`
			`The first iteration of DNN training.`
			`Phase 2. Runtime Phase: use the read-only collected stats`
			`The rest iterations of DNN training.`

			`It has a Sampling counter which is reset after DNN training iteration.`
			`"""`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
			`def __init__(self) -> None:`
			`self._sampling_cnter = SamplingCounter()`
[refactor] memory utils (#577) 3 years ago			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
[refactor] memory utils (#577) 3 years ago			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
[zero] add sampling time for memstats collector (#610) 3 years ago			`self._sampling_time = []`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago			`self._start_flag = False`

[refactor] memory utils (#577) 3 years ago			`def overall_mem_stats(self, device_type: str):`
			`if device_type == 'cuda':`
			`return self._overall_cuda_list`
			`elif device_type == 'cpu':`
			`return self._overall_cpu_list`
			`else:`
			`raise TypeError`
[zero] dump memory stats for sharded model (#548) 3 years ago
[refactor] memory utils (#577) 3 years ago			`def model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:`
			`if unit == 'GB':`
			`scale = 1e9`
			`elif unit == 'MB':`
			`scale = 1e6`
			`elif unit == 'KB':`
			`scale = 1e3`
[zero] add sampling time for memstats collector (#610) 3 years ago			`elif unit == 'B':`
			`scale = 1`
[refactor] memory utils (#577) 3 years ago			`else:`
			`raise TypeError`

			`if device_type == 'cuda':`
			`return [elem / scale for elem in self._model_data_cuda_list]`
			`elif device_type == 'cpu':`
			`return [elem / scale for elem in self._model_data_cpu_list]`
			`else:`
			`raise TypeError`

			`def non_model_data_cuda_list(self, device_type: str, unit: str = 'B') -> List[int]:`
[zero] refactor model data tracing (#537) 3 years ago			`"""Non model data stats`
			`"""`
[refactor] memory utils (#577) 3 years ago			`if unit == 'GB':`
			`scale = 1e9`
			`elif unit == 'MB':`
			`scale = 1e6`
			`elif unit == 'KB':`
			`scale = 1e3`
[zero] add sampling time for memstats collector (#610) 3 years ago			`elif unit == 'B':`
			`scale = 1`
			`else:`
			`raise TypeError`
[refactor] memory utils (#577) 3 years ago
			`if device_type == 'cuda':`
			`return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cuda_list, self._model_data_cuda_list)]`
			`elif device_type == 'cpu':`
			`return [(v1 - v2) / scale for v1, v2 in zip(self._overall_cpu_list, self._model_data_cpu_list)]`
			`else:`
			`raise TypeError`
[zero] refactor model data tracing (#537) 3 years ago
[zero] add sampling time for memstats collector (#610) 3 years ago			`@property`
			`def sampling_time(self):`
			`return [t - self._sampling_time[0] for t in self._sampling_time]`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago			`def start_collection(self):`
			`self._start_flag = True`

			`def finish_collection(self):`
			`self._start_flag = False`

			`def sample_memstats(self) -> None:`
			`"""`
			`Sampling memory statistics.`
			`Record the current model data CUDA memory usage as well as system CUDA memory usage.`
[refactor] memory utils (#577) 3 years ago			`Advance the sampling cnter.`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago			`"""`
			`if self._start_flag:`
			`sampling_cnt = self._sampling_cnter.sampling_cnt`
[refactor] memory utils (#577) 3 years ago			`assert sampling_cnt == len(self._overall_cuda_list)`
			`self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)`
			`self._overall_cuda_list.append(colo_device_memory_used(get_current_device()))`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
[refactor] memory utils (#577) 3 years ago			`self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)`
			`self._overall_cpu_list.append(colo_device_memory_used(torch.device(f'cpu')))`

[zero] add sampling time for memstats collector (#610) 3 years ago			`self._sampling_time.append(time.time())`

[refactor] memory utils (#577) 3 years ago			`self._sampling_cnter.advance()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
			`def reset_sampling_cnter(self) -> None:`
			`self._sampling_cnter.reset()`

			`def clear(self) -> None:`
[refactor] memory utils (#577) 3 years ago			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
[refactor] memory utils (#577) 3 years ago			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 3 years ago
			`self._start_flag = False`
			`self._sampling_cnter.reset()`