ColossalAI/colossalai/utils/memory_tracer/memstats_collector.py

from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
from colossalai.utils.memory_utils.utils import colo_device_memory_used
from colossalai.utils import get_current_device
from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor
import torch
import time
from typing import List


class SamplingCounter:
    """Step counter that tracks the position of the current memory sample.

    The counter only moves forward through ``advance``. Calling ``reset``
    stores the number of steps counted so far as the maximum and rewinds the
    counter to zero.
    """

    def __init__(self) -> None:
        # The maximum stays unknown (None) until the first reset().
        self._max_sampling_cnt = None
        self._samplint_cnt = 0

    def advance(self):
        """Move the counter forward by one step."""
        self._samplint_cnt = self._samplint_cnt + 1

    def next(self):
        """Return the index after the current one, wrapped at the recorded maximum.

        Requires ``reset`` to have been called at least once so that the
        maximum is known.
        """
        limit = self._max_sampling_cnt
        assert limit is not None
        following = self._samplint_cnt + 1
        return following % limit

    def current(self):
        """Return the current step index."""
        return self._samplint_cnt

    def max(self):
        """Return the step count stored by the last ``reset`` (None before any reset)."""
        return self._max_sampling_cnt

    def reset(self):
        """Store the current count as the maximum and restart counting from zero."""
        self._max_sampling_cnt, self._samplint_cnt = self._samplint_cnt, 0


class MemStatsCollector:
    """
    A memory statistics collector.
    It works in two phases.
    Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.
    The first iteration of DNN training.
    Phase 2. Runtime Phase: use the read-only collected stats.
    The remaining iterations of DNN training.

    It has a sampling counter which is reset after each DNN training iteration.

    For every sample, three series are kept per device ('cuda' and 'cpu'):
    model data usage, overall usage, and non-model data usage, where
    non-model data = overall - model data.
    """

    # Divisors used to convert a raw value into the requested unit.
    _UNIT_SCALES = {'GB': 1e9, 'MB': 1e6, 'KB': 1e3, 'B': 1}

    def __init__(self) -> None:
        self._sampling_cnter = SamplingCounter()
        self._mem_monitor = AsyncMemoryMonitor()
        self._model_data_cuda_list = []
        self._overall_cuda_list = []

        self._model_data_cpu_list = []
        self._overall_cpu_list = []

        self._non_model_data_cuda_list = []
        self._non_model_data_cpu_list = []
        self._sampling_time = []

        self._start_flag = False

    @classmethod
    def _unit_scale(cls, unit: str) -> float:
        """Return the divisor for ``unit``.

        Raises:
            TypeError: if ``unit`` is not one of 'GB', 'MB', 'KB', 'B'.
        """
        try:
            return cls._UNIT_SCALES[unit]
        except KeyError:
            raise TypeError(f"unsupported unit {unit!r}, expected one of 'GB', 'MB', 'KB', 'B'") from None

    @staticmethod
    def _select_by_device(device_type: str, cuda_list: list, cpu_list: list) -> list:
        """Return ``cuda_list`` or ``cpu_list`` according to ``device_type``.

        Raises:
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        if device_type == 'cuda':
            return cuda_list
        if device_type == 'cpu':
            return cpu_list
        raise TypeError(f"unsupported device_type {device_type!r}, expected 'cuda' or 'cpu'")

    def _record_sample(self, model_cuda, overall_cuda, model_cpu, overall_cpu, timestamp) -> None:
        """Append one sample to every series so that all lists stay the same length.

        Non-model data is derived as ``overall - model`` for both devices.
        """
        self._model_data_cuda_list.append(model_cuda)
        self._overall_cuda_list.append(overall_cuda)
        self._non_model_data_cuda_list.append(overall_cuda - model_cuda)

        self._model_data_cpu_list.append(model_cpu)
        self._overall_cpu_list.append(overall_cpu)
        self._non_model_data_cpu_list.append(overall_cpu - model_cpu)

        self._sampling_time.append(timestamp)

    def overall_mem_stats(self, device_type: str):
        """Return the list of overall usage samples for ``device_type``.

        The internal list itself is returned, not a copy.

        Raises:
            TypeError: if ``device_type`` is neither 'cuda' nor 'cpu'.
        """
        return self._select_by_device(device_type, self._overall_cuda_list, self._overall_cpu_list)

    def model_data_list(self, device_type: str, unit: str = 'B') -> List[float]:
        """Return the model data samples for ``device_type`` divided by the scale of ``unit``.

        Raises:
            TypeError: if ``unit`` or ``device_type`` is not supported.
        """
        # The unit is validated before the device type, as callers may rely on that order.
        scale = self._unit_scale(unit)
        samples = self._select_by_device(device_type, self._model_data_cuda_list, self._model_data_cpu_list)
        return [elem / scale for elem in samples]

    def non_model_data_list(self, device_type: str, unit: str = 'B') -> List[float]:
        """Return the non-model data samples for ``device_type`` divided by the scale of ``unit``.

        Raises:
            TypeError: if ``unit`` or ``device_type`` is not supported.
        """
        scale = self._unit_scale(unit)
        samples = self._select_by_device(device_type, self._non_model_data_cuda_list,
                                         self._non_model_data_cpu_list)
        return [elem / scale for elem in samples]

    def current_non_model_data(self, device_type: str) -> float:
        """Get the non-model data of the current sampling moment."""
        return self.non_model_data_list(device_type)[self._sampling_cnter.current()]

    def next_non_model_data(self, device_type: str):
        """Get the non-model data of the next sampling moment."""
        return self.non_model_data_list(device_type)[self._sampling_cnter.next()]

    @property
    def sampling_time(self):
        """Sampling timestamps relative to the first recorded sample (empty if none)."""
        return [t - self._sampling_time[0] for t in self._sampling_time]

    def start_collection(self):
        """Enable sampling and start the memory monitor."""
        self._start_flag = True
        self._mem_monitor.start()

    def finish_collection(self):
        """Disable sampling; already collected statistics are kept."""
        self._start_flag = False

    def sample_memstats(self) -> None:
        """
        Sampling memory statistics.
        Record the current model data CUDA/CPU memory usage as well as the overall
        CUDA/CPU memory usage, when collection is enabled.
        Advance the sampling counter (this happens whether or not collection is enabled).
        """
        if self._start_flag:
            sampling_cnt = self._sampling_cnter.current()
            assert sampling_cnt == len(self._overall_cuda_list)

            # Take all readings first, then append them in one go.
            model_cuda = GLOBAL_MODEL_DATA_TRACER.cuda_usage
            # The overall CUDA figure is whatever the monitor's finish() returns.
            overall_cuda = self._mem_monitor.finish()
            model_cpu = GLOBAL_MODEL_DATA_TRACER.cpu_usage
            # FIXME(jiaruifang) cpu sys used should also return from self._mem_monitor()
            overall_cpu = colo_device_memory_used(torch.device('cpu'))

            self._record_sample(model_cuda, overall_cuda, model_cpu, overall_cpu, time.time())
            # Restart the monitor for the interval up to the next sample.
            self._mem_monitor.start()
        # TODO(ver217): refactor sampler
        self._sampling_cnter.advance()

    def reset_sampling_cnter(self) -> None:
        """Reset the sampling counter and finish the memory monitor."""
        self._sampling_cnter.reset()
        self._mem_monitor.finish()

    def clear(self) -> None:
        """Drop all collected statistics, disable sampling and reset the counter."""
        self._model_data_cuda_list = []
        self._overall_cuda_list = []

        self._model_data_cpu_list = []
        self._overall_cpu_list = []

        # These derived series and the timestamps must be dropped too; otherwise
        # stale entries would no longer line up with the lists above.
        self._non_model_data_cuda_list = []
        self._non_model_data_cpu_list = []
        self._sampling_time = []

        self._start_flag = False
        self._sampling_cnter.reset()
        self._mem_monitor.finish()
[polish] use GLOBAL_MODEL_DATA_TRACER (#417) 2022-03-15 03:29:46 +00:00			`from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`from colossalai.utils.memory_utils.utils import colo_device_memory_used`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`from colossalai.utils import get_current_device`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`import torch`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`import time`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`from typing import List`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00

			`class SamplingCounter:`

			`def __init__(self) -> None:`
			`self._samplint_cnt = 0`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`self._max_sampling_cnt = None`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def advance(self):`
			`self._samplint_cnt += 1`

[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`def next(self):`
			`assert self._max_sampling_cnt is not None`
			`return (self._samplint_cnt + 1) % self._max_sampling_cnt`

[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`def current(self):`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`return self._samplint_cnt`

[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`def max(self):`
			`return self._max_sampling_cnt`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`def reset(self):`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`self._max_sampling_cnt = self._samplint_cnt`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`self._samplint_cnt = 0`


			`class MemStatsCollector:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`"""`
			`A Memory statistic collector.`
			`It works in two phases.`
			`Phase 1. Collection Phase: collect memory usage statistics of CPU and GPU.`
			`The first iteration of DNN training.`
			`Phase 2. Runtime Phase: use the read-only collected stats`
			`The rest iterations of DNN training.`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`It has a Sampling counter which is reset after DNN training iteration.`
			`"""`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def __init__(self) -> None:`
			`self._sampling_cnter = SamplingCounter()`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._mem_monitor = AsyncMemoryMonitor()`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`self._non_model_data_cuda_list = []`
			`self._non_model_data_cpu_list = []`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`self._sampling_time = []`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`self._start_flag = False`

[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`def overall_mem_stats(self, device_type: str):`
			`if device_type == 'cuda':`
			`return self._overall_cuda_list`
			`elif device_type == 'cpu':`
			`return self._overall_cpu_list`
			`else:`
			`raise TypeError`
[zero] dump memory stats for sharded model (#548) 2022-03-30 01:38:44 +00:00
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`def model_data_list(self, device_type: str, unit: str = 'B') -> List[int]:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`if unit == 'GB':`
			`scale = 1e9`
			`elif unit == 'MB':`
			`scale = 1e6`
			`elif unit == 'KB':`
			`scale = 1e3`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`elif unit == 'B':`
			`scale = 1`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`else:`
			`raise TypeError`

			`if device_type == 'cuda':`
			`return [elem / scale for elem in self._model_data_cuda_list]`
			`elif device_type == 'cpu':`
			`return [elem / scale for elem in self._model_data_cpu_list]`
			`else:`
			`raise TypeError`

[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`def non_model_data_list(self, device_type: str, unit: str = 'B') -> List[int]:`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`"""Non model data stats`
			`"""`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`if unit == 'GB':`
			`scale = 1e9`
			`elif unit == 'MB':`
			`scale = 1e6`
			`elif unit == 'KB':`
			`scale = 1e3`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`elif unit == 'B':`
			`scale = 1`
			`else:`
			`raise TypeError`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00
			`if device_type == 'cuda':`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`return [elem / scale for elem in self._non_model_data_cuda_list]`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`elif device_type == 'cpu':`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`return [elem / scale for elem in self._non_model_data_cpu_list]`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`else:`
			`raise TypeError`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`def current_non_model_data(self, device_type: str) -> int:`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`"""get the non model data of the current sampling moment`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`"""`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`return self.non_model_data_list(device_type)[self._sampling_cnter.current()]`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00
			`def next_non_model_data(self, device_type: str):`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`"""get the non model data of the next sampling moment`
			`"""`
[zero] initialize a stateful tensor manager (#614) 2022-04-06 08:18:49 +00:00			`return self.non_model_data_list(device_type)[self._sampling_cnter.next()]`

[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`@property`
			`def sampling_time(self):`
			`return [t - self._sampling_time[0] for t in self._sampling_time]`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`def start_collection(self):`
			`self._start_flag = True`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._mem_monitor.start()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def finish_collection(self):`
			`self._start_flag = False`

			`def sample_memstats(self) -> None:`
			`"""`
			`Sampling memory statistics.`
			`Record the current model data CUDA memory usage as well as system CUDA memory usage.`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`Advance the sampling cnter.`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`"""`
			`if self._start_flag:`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`sampling_cnt = self._sampling_cnter.current()`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`assert sampling_cnt == len(self._overall_cuda_list)`
			`self._model_data_cuda_list.append(GLOBAL_MODEL_DATA_TRACER.cuda_usage)`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._overall_cuda_list.append(self._mem_monitor.finish())`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`self._non_model_data_cuda_list.append(self._model_data_cuda_list[-1] - self._overall_cuda_list[-1])`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cpu_list.append(GLOBAL_MODEL_DATA_TRACER.cpu_usage)`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`# FIXME(jiaruifang) cpu sys used should also return from self._mem_monitor()`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._overall_cpu_list.append(colo_device_memory_used(torch.device(f'cpu')))`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`self._non_model_data_cpu_list.append(self._overall_cpu_list[-1] - self._model_data_cpu_list[-1])`
[zero] add sampling time for memstats collector (#610) 2022-04-01 06:03:00 +00:00			`self._sampling_time.append(time.time())`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._mem_monitor.start()`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`# TODO(ver217): refactor sampler`
			`# print(f'{self._sampling_cnter.current()} / {self._sampling_cnter.max()}, len = {len(self._sampling_time)}')`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._sampling_cnter.advance()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def reset_sampling_cnter(self) -> None:`
			`self._sampling_cnter.reset()`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`self._mem_monitor.finish()`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`def clear(self) -> None:`
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cuda_list = []`
			`self._overall_cuda_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[refactor] memory utils (#577) 2022-04-01 01:22:33 +00:00			`self._model_data_cpu_list = []`
			`self._overall_cpu_list = []`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`self._start_flag = False`
			`self._sampling_cnter.reset()`
[zero] stateful tensor manager (#687) * [WIP] stateful tensor manager * add eviction strategy * polish code * polish code * polish comment * add unit test * fix sampler bug * polish code * fix max sampling cnt resetting bug * fix sampler bug * polish code * fix bug * fix unit test Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-04-08 09:51:34 +00:00			`self._mem_monitor.finish()`