# ColossalAI/colossalai/gemini/memory_tracer/model_data_memtracer.py

from colossalai.context.singleton_meta import SingletonMeta
import torch
from typing import Tuple, Optional
from colossalai.logging import DistributedLogger


def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
    """Report the memory footprint of an optimizer.

    Args:
        optim: an optimizer object exposing ``get_memory_usage()``
            (e.g. a ShardedOptimV2 instance), or None.

    Returns:
        Tuple[int, int]: the value of ``optim.get_memory_usage()``, i.e. the
        cuda/cpu memory usage in Byte; ``(0, 0)`` when ``optim`` is None.
    """
    if optim is not None:
        # The optimizer itself is responsible for the accounting; just make
        # sure it actually offers the hook before delegating to it.
        assert hasattr(optim, 'get_memory_usage'), f"{type(optim)} has no attr get_memory_usage()"
        return optim.get_memory_usage()
    return 0, 0


def colo_model_mem_usage(model: torch.nn.Module) -> Tuple[int, int]:
    """
    Sum up the memory occupied by the parameters of a model.

    A parameter carrying a ``colo_attr`` attribute reports its own usage via
    ``colo_attr.get_memory_usage()``; for any other parameter the payload
    (``param.data``) and the gradient (``param.grad``) are both counted.
    Only tensors living on 'cpu' or 'cuda' devices contribute to the totals.

    Args:
        model (torch.nn.Module): a torch model, or None

    Returns:
        Tuple[int, int]: cuda memory usage in Byte, cpu memory usage in Byte
    """
    if model is None:
        return 0, 0

    def _tensor_bytes(tensor: Optional[torch.Tensor]):
        # Returns (cuda_bytes, cpu_bytes) for one tensor; a missing tensor
        # (e.g. a gradient that has not been produced yet) costs nothing.
        if tensor is None:
            return 0, 0
        assert isinstance(tensor, torch.Tensor)
        size_in_bytes = tensor.numel() * tensor.element_size()
        device_type = tensor.device.type
        if device_type == 'cuda':
            return size_in_bytes, 0
        if device_type == 'cpu':
            return 0, size_in_bytes
        # Any other device type is not tracked.
        return 0, 0

    total_cuda, total_cpu = 0, 0
    for p in model.parameters():
        if hasattr(p, 'colo_attr'):
            contributions = [p.colo_attr.get_memory_usage()]
        else:
            contributions = [_tensor_bytes(p.data), _tensor_bytes(p.grad)]
        for on_cuda, on_cpu in contributions:
            total_cuda += on_cuda
            total_cpu += on_cpu

    return total_cuda, total_cpu


class ModelDataTracer(metaclass=SingletonMeta):
    """
    Singleton tracer that reports the model data memory usage at runtime.
    A model (and optionally an optimizer) must be registered on it before
    the reported numbers are meaningful; with nothing registered every
    usage query yields zero.
    """

    def __init__(self) -> None:
        # Nothing is tracked until register_model / register_optimizer is called.
        self._model = None
        self._opitimizer = None
        self._logger = DistributedLogger("ModelDataTracer")

    def _get_mem_usage(self) -> Tuple[int, int]:
        """
        Combine the memory usage of the registered optimizer and model.
        Returns:
            Tuple[int, int]: cuda, cpu mem usage
        """
        opt_cuda, opt_cpu = colo_model_optimizer_usage(self._opitimizer)
        model_cuda, model_cpu = colo_model_mem_usage(self._model)
        total_cuda = opt_cuda + model_cuda
        total_cpu = opt_cpu + model_cpu
        return total_cuda, total_cpu

    def register_model(self, model) -> None:
        """Set the model to trace; warns when one was registered before and replaces it."""
        already_registered = self._model is not None
        if already_registered:
            self._logger.warning("ModelDataTracer has already registered a model")
        self._model = model

    def register_optimizer(self, optimizer) -> None:
        """Set the optimizer to trace; warns when one was registered before and replaces it."""
        already_registered = self._opitimizer is not None
        if already_registered:
            self._logger.warning("ModelDataTracer has already registered an optimizer")
        self._opitimizer = optimizer

    @property
    def cpu_usage(self):
        """CPU part of the current memory usage."""
        usage = self._get_mem_usage()
        return usage[1]

    @property
    def cuda_usage(self):
        """CUDA part of the current memory usage."""
        usage = self._get_mem_usage()
        return usage[0]

    @property
    def both_mem_usage(self):
        """Current memory usage as a (cuda, cpu) pair."""
        return self._get_mem_usage()


# Module-level ModelDataTracer instance, created when this module is imported.
GLOBAL_MODEL_DATA_TRACER = ModelDataTracer()
[polish] polish singleton and global context (#500) 2022-03-23 10:03:39 +00:00			`from colossalai.context.singleton_meta import SingletonMeta`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`import torch`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`from typing import Tuple, Optional`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`from colossalai.logging import DistributedLogger`
[memory] add model data tensor moving api (#503) 2022-03-24 06:29:41 +00:00

[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`def colo_model_optimizer_usage(optim) -> Tuple[int, int]:`
			`"""Trace the optimizer memory usage`

			`Args:`
			`optim (ShardedOptimV2): an instance of ShardedOptimver`

			`Returns:`
			`Tuple[int, int]: cuda/cpu memory usage in Byte`
			`"""`
			`if optim is None:`
			`return 0, 0`
			`assert hasattr(optim, 'get_memory_usage'), f"{type(optim)} has no attr get_memory_usage()"`
			`return optim.get_memory_usage()`


			`def colo_model_mem_usage(model: torch.nn.Module) -> Tuple[int, int]:`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`"""`
			`Trace the model memory usage.`
			`Args:`
			`model (torch.nn.Module): a torch model`

			`Returns:`
			`Tuple[int, int]: cuda memory usage in Byte, cpu memory usage in Byte`
			`"""`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`if model is None:`
			`return 0, 0`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00
			`def _get_tensor_mem_use(t: Optional[torch.Tensor]):`
			`if t is None:`
[hotfix] fix a bug in model data stats tracing (#655) 2022-04-03 13:48:06 +00:00			`return 0, 0`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`assert isinstance(t, torch.Tensor)`
			`_cpu_mem_usage, _cuda_mem_usage = 0, 0`
			`if t.device.type == 'cpu':`
			`_cpu_mem_usage += t.numel() * t.element_size()`
			`elif t.device.type == 'cuda':`
[zero] adapt zero for unsharded parameters (#561) * support existing sharded and unsharded parameters in zero * add unitest for moe-zero model init * polish moe gradient handler 2022-03-31 10:34:11 +00:00			`_cuda_mem_usage += t.numel() * t.element_size()`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`return _cuda_mem_usage, _cpu_mem_usage`

			`cuda_mem_usage = 0`
			`cpu_mem_usage = 0`
			`for param in model.parameters():`
[polish] rename col_attr -> colo_attr (#558) 2022-03-31 04:25:45 +00:00			`if hasattr(param, 'colo_attr'):`
			`t_cuda, t_cpu = param.colo_attr.get_memory_usage()`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`cuda_mem_usage += t_cuda`
			`cpu_mem_usage += t_cpu`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`else:`
			`t_cuda, t_cpu = _get_tensor_mem_use(param.data)`
			`cuda_mem_usage += t_cuda`
			`cpu_mem_usage += t_cpu`
			`t_cuda, t_cpu = _get_tensor_mem_use(param.grad)`
			`cuda_mem_usage += t_cuda`
			`cpu_mem_usage += t_cpu`

			`return cuda_mem_usage, cpu_mem_usage`


[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`class ModelDataTracer(metaclass=SingletonMeta):`
			`"""`
[polish] use GLOBAL_MODEL_DATA_TRACER (#417) 2022-03-15 03:29:46 +00:00			`A tracer singleton to trace model data usage during runtime.`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`You have to register a model on the singleton first.`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`"""`

			`def __init__(self) -> None:`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`self._logger = DistributedLogger("ModelDataTracer")`
			`self._model = None`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`self._opitimizer = None`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`def _get_mem_usage(self) -> Tuple[int, int]:`
			`"""`
			`get the memory usage of the model registered.`
			`Returns:`
			`Tuple[int, int]: cuda, cpu mem usage`
			`"""`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`cuda_use_opt, cpu_use_opt = colo_model_optimizer_usage(self._opitimizer)`
			`cuda_use_model, cpu_use_model = colo_model_mem_usage(self._model)`
			`return cuda_use_opt + cuda_use_model, cpu_use_opt + cpu_use_model`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`def register_model(self, model) -> None:`
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`if self._model is not None:`
			`self._logger.warning("ModelDataTracer has already registered a model")`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`self._model = model`
[zero] fix init device bug in zero init context unittest (#516) 2022-03-25 04:24:18 +00:00
[zero] non model data tracing (#545) 2022-03-29 07:45:48 +00:00			`def register_optimizer(self, optimizer) -> None:`
			`if self._opitimizer is not None:`
			`self._logger.warning("ModelDataTracer has already registered an optimizer")`
			`self._opitimizer = optimizer`

[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00			`@property`
			`def cpu_usage(self):`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`_, cpu_usage = self._get_mem_usage()`
			`return cpu_usage`
[zero] memtracer to record cuda memory usage of model data and overall system (#395) 2022-03-14 14:05:30 +00:00
			`@property`
			`def cuda_usage(self):`
[zero] refactor model data tracing (#537) 2022-03-28 08:38:18 +00:00			`cuda_usage, _ = self._get_mem_usage()`
			`return cuda_usage`
[polish] use GLOBAL_MODEL_DATA_TRACER (#417) 2022-03-15 03:29:46 +00:00
[zero] refactor memstats_collector (#746) 2022-04-14 04:01:12 +00:00			`@property`
			`def both_mem_usage(self):`
			`return self._get_mem_usage()`

[polish] use GLOBAL_MODEL_DATA_TRACER (#417) 2022-03-15 03:29:46 +00:00
			`GLOBAL_MODEL_DATA_TRACER = ModelDataTracer()`