2022-12-06 14:10:47 +00:00
|
|
|
from typing import Optional, Tuple
|
|
|
|
|
2022-03-14 14:05:30 +00:00
|
|
|
import torch
|
2022-12-06 14:10:47 +00:00
|
|
|
|
2022-03-24 06:29:41 +00:00
|
|
|
|
2022-03-29 07:45:48 +00:00
|
|
|
def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
    """Report the memory footprint of an optimizer.

    The measurement itself is delegated to the optimizer's own
    ``get_memory_usage()`` method; this function only guards the call.

    Args:
        optim (ShardedOptimV2): an instance of ShardedOptimizer, or ``None``.

    Returns:
        Tuple[int, int]: cuda/cpu memory usage in Byte. ``(0, 0)`` is returned
        when ``optim`` is ``None``.

    Raises:
        AssertionError: if ``optim`` does not provide ``get_memory_usage``.
    """
    if optim is not None:
        # The optimizer must expose the reporting hook we are about to call.
        can_report = hasattr(optim, "get_memory_usage")
        assert can_report, f"{type(optim)} has no attr get_memory_usage()"
        return optim.get_memory_usage()

    # Nothing to measure without an optimizer.
    return 0, 0
|
|
|
|
|
|
|
|
|
|
|
|
def colo_model_mem_usage(model: torch.nn.Module) -> Tuple[int, int]:
    """
    Measure how many bytes the parameters of a model occupy per device type.

    Parameters that carry a ``colo_attr`` attribute report their own usage
    through ``colo_attr.get_memory_usage()``. For every other parameter the
    sizes of ``param.data`` and ``param.grad`` are counted. Tensors that live
    on a device other than cpu or cuda contribute nothing to either total.

    Args:
        model (torch.nn.Module): a torch model, or ``None``.

    Returns:
        Tuple[int, int]: cuda memory usage in Byte, cpu memory usage in Byte.
        ``(0, 0)`` is returned when ``model`` is ``None``.
    """
    if model is None:
        return 0, 0

    def _tensor_bytes(tensor: Optional[torch.Tensor]) -> Tuple[int, int]:
        # Returns (cuda_bytes, cpu_bytes) for one tensor; a missing tensor
        # (e.g. a gradient that is None) costs nothing.
        if tensor is None:
            return 0, 0
        assert isinstance(tensor, torch.Tensor)

        size_in_bytes = tensor.numel() * tensor.element_size()
        device_kind = tensor.device.type
        on_cuda = size_in_bytes if device_kind == "cuda" else 0
        on_cpu = size_in_bytes if device_kind == "cpu" else 0
        return on_cuda, on_cpu

    total_cuda, total_cpu = 0, 0
    for p in model.parameters():
        if hasattr(p, "colo_attr"):
            # The attached colo_attr object does its own accounting.
            usages = (p.colo_attr.get_memory_usage(),)
        else:
            # Plain parameter: count the payload and, if present, its gradient.
            usages = (_tensor_bytes(p.data), _tensor_bytes(p.grad))

        for used_cuda, used_cpu in usages:
            total_cuda += used_cuda
            total_cpu += used_cpu

    return total_cuda, total_cpu
|