# ColossalAI/colossalai/legacy/utils/memory.py

import gc
from collections import namedtuple

import psutil
import torch
import torch.distributed as dist
from packaging import version

from colossalai.accelerator import get_accelerator
from colossalai.legacy.core import global_context as gpc
from colossalai.logging import get_dist_logger

# Fraction of the CUDA device's total memory that the current process may use.
# Written by colo_set_process_memory_fraction and applied as a multiplier in
# colo_device_memory_capacity.
_GLOBAL_CUDA_MEM_FRACTION = 1.0
# CPU memory capacity configured through colo_set_cpu_memory_capacity.
# -1 is the "not configured" sentinel: colo_get_cpu_memory_capacity then falls
# back to the total reported by _get_cpu_memory_info.
_GLOBAL_CPU_MEM_CAPACITY = -1


def _bytes_to_MB(val, decimal=2):
    """Convert a byte count to megabytes using binary notation (1 MB = 1024 * 1024 bytes).

    :param val: number of bytes to convert
    :param decimal: number of decimal places kept when rounding the result
    :return: the size in MB, rounded to ``decimal`` places
    """
    bytes_per_mb = 1024 * 1024
    megabytes = val / bytes_per_mb
    return round(megabytes, decimal)


# copy from PatrickStar
def _get_cpu_memory_info():
    """Collect CPU memory statistics, preferring container-level figures.

    The cgroup ``memory.meminfo`` file is tried first; when it is missing,
    unreadable, incomplete or malformed, the values reported by
    ``psutil.virtual_memory()`` are used instead.

    Returns:
        ps_mem_info: a namedtuple with the fields ``total``, ``free``,
        ``cached``, ``buffers`` and ``used``. ``cached`` and ``buffers`` are 0
        in the psutil fallback when psutil does not expose those fields.
    """
    ps_mem_info = namedtuple("ps_mem_info", ["total", "free", "cached", "buffers", "used"])
    try:
        # psutil reports the host memory instead of that of the container,
        # so first try to read the container memory with the method in:
        # https://stackoverflow.com/a/46213331/5163915
        mems = {}
        with open("/sys/fs/cgroup/memory/memory.meminfo", "rb") as f:
            for line in f:
                fields = line.split()
                # Skip blank or truncated lines instead of failing with IndexError.
                if len(fields) < 2:
                    continue
                # The numeric field is scaled by 1024, as in the original implementation.
                mems[fields[0]] = int(fields[1]) * 1024
        total = mems[b"MemTotal:"]
        free = mems[b"MemFree:"]
        cached = mems[b"Cached:"]
        buffers = mems[b"Buffers:"]
        used = total - free - cached - buffers
        if used < 0:
            # The page-cache figures cannot be trusted here; ignore them.
            used = total - free
        mem_info = ps_mem_info(total=total, free=free, cached=cached, buffers=buffers, used=used)
    except (OSError, KeyError, ValueError):
        # OSError covers a missing file (FileNotFoundError) as well as an
        # unreadable one; KeyError / ValueError cover an incomplete or
        # malformed file. In all of these cases use psutil's view.
        vm_stats = psutil.virtual_memory()
        mem_info = ps_mem_info(
            total=vm_stats.total,
            free=vm_stats.free,
            # ``cached`` and ``buffers`` are platform-specific psutil fields.
            cached=getattr(vm_stats, "cached", 0),
            buffers=getattr(vm_stats, "buffers", 0),
            used=vm_stats.used,
        )
    return mem_info


def report_memory_usage(message, logger=None, report_cpu=False):
    """Log current and peak GPU memory usage (in MB), optionally with CPU RAM usage.

    After logging, the CUDA peak-memory counters are reset so that the next call
    reports the peak reached since this one.

    Args:
        message (str): A prefix message to add in the log.
        logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
            When ``None``, the logger returned by ``get_dist_logger()`` is used.
        report_cpu (bool, optional): Whether to report CPU memory.

    Raises:
        EnvironmentError: Raise error if no distributed environment has been initialized.
    """
    if not dist.is_initialized():
        raise EnvironmentError("No distributed environment is initialized")

    allocated_mb = _bytes_to_MB(torch.cuda.memory_allocated())
    peak_allocated_mb = _bytes_to_MB(torch.cuda.max_memory_allocated())
    reserved_mb = _bytes_to_MB(torch.cuda.memory_reserved())
    peak_reserved_mb = _bytes_to_MB(torch.cuda.max_memory_reserved())

    log_parts = [
        f"{message}: GPU: allocated {allocated_mb} MB, max allocated {peak_allocated_mb} MB, ",
        f"cached: {reserved_mb} MB, max cached: {peak_reserved_mb} MB",
    ]

    if report_cpu:
        # Garbage collection is not real-time, so collect explicitly to get accurate RAM numbers.
        gc.collect()
        ram = psutil.virtual_memory()
        ram_used_mb = _bytes_to_MB(ram.total - ram.available)
        log_parts.append(f", CPU Virtual Memory: used = {ram_used_mb} MB, percent = {ram.percent}%")

    active_logger = logger if logger is not None else get_dist_logger()
    active_logger.info("".join(log_parts))

    # Reset the peak counters so the next report covers only the interval since this call.
    if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
        torch.cuda.reset_peak_memory_stats()


def colo_device_memory_capacity(device: torch.device) -> int:
    """
    Get the memory capacity available to the current process on a device.

    Args:
        device (torch.device): a device; only the ``cpu`` and ``cuda`` types are handled.

    Returns:
        int: size in byte. NOTE(review): the CPU branch uses true division and the CUDA
        branch multiplies by the configured fraction, so the value may be a float; any
        other device type falls through and yields ``None``.
    """
    assert isinstance(device, torch.device)
    device_kind = device.type
    if device_kind == "cpu":
        # In the context of 1-CPU-N-GPU, each process gets 1/N of the overall CPU memory.
        node_capacity = colo_get_cpu_memory_capacity()
        return node_capacity / gpc.num_processes_on_current_node
    if device_kind == "cuda":
        # The accelerator's current device is queried, not the ``device`` argument itself.
        current_device = get_accelerator().get_current_device()
        total_bytes = torch.cuda.get_device_properties(current_device).total_memory
        return total_bytes * _GLOBAL_CUDA_MEM_FRACTION


def colo_device_memory_used(device: torch.device) -> int:
    """
    Get the memory in use on a device that is attributed to the current process.

    Args:
        device (torch.device): a device; only the ``cpu`` and ``cuda`` types are handled.

    Returns:
        int: memory size in bytes. NOTE(review): the CPU branch uses true division, so the
        value may be a float; any other device type falls through and yields ``None``.
    """
    device_kind = device.type
    if device_kind == "cuda":
        allocated: int = torch.cuda.memory_allocated(device)
        # Reset the peak counters as a side effect so that later peak readings start fresh.
        if hasattr(torch.cuda, "reset_peak_memory_stats"):  # pytorch 1.4+
            torch.cuda.reset_peak_memory_stats(device)
        return allocated
    if device_kind == "cpu":
        # In the context of 1-CPU-N-GPU, every process is assumed to consume the same
        # amount, so the usage of the current process is 1/N of the CPU memory used.
        return _get_cpu_memory_info().used / gpc.num_processes_on_current_node


def colo_set_process_memory_fraction(ratio: float) -> None:
    """colo_set_process_memory_fraction

    Set how much cuda memory may be used on the gpu belonging to the current process.
    On torch versions older than 1.8 a warning is logged and nothing is changed.

    The module-level fraction (used by ``colo_device_memory_capacity``) is only
    updated after torch has accepted the new limit, so a rejected ``ratio``
    leaves the previously recorded fraction intact.

    Args:
        ratio (float): a ratio between 0. ~ 1.
    """
    global _GLOBAL_CUDA_MEM_FRACTION
    if version.parse(torch.__version__) < version.parse("1.8"):
        logger = get_dist_logger("colo_set_process_memory_fraction")
        logger.warning("colo_set_process_memory_fraction failed because torch version is less than 1.8")
        return
    # Apply the limit first: if torch raises (e.g. for an invalid ratio), the
    # recorded fraction must keep matching the limit that is actually in force.
    torch.cuda.set_per_process_memory_fraction(ratio, get_accelerator().get_current_device())
    _GLOBAL_CUDA_MEM_FRACTION = ratio


def colo_set_cpu_memory_capacity(size: int) -> None:
    """Set the CPU memory capacity reported by ``colo_get_cpu_memory_capacity``.

    The requested size is capped at the total memory reported by
    ``_get_cpu_memory_info``.

    Args:
        size (int): the requested capacity, in the same unit as the detected total.
    """
    global _GLOBAL_CPU_MEM_CAPACITY
    detected_total = _get_cpu_memory_info().total
    _GLOBAL_CPU_MEM_CAPACITY = size if size <= detected_total else detected_total


def colo_get_cpu_memory_capacity() -> int:
    """
    Get the cpu memory capacity. We may not use all of it.

    Returns:
        int: the capacity set through ``colo_set_cpu_memory_capacity``, or the total
        reported by ``_get_cpu_memory_info`` when none has been set (sentinel ``-1``).
    """
    if _GLOBAL_CPU_MEM_CAPACITY != -1:
        return _GLOBAL_CPU_MEM_CAPACITY
    return _get_cpu_memory_info().total
[refactor] refactor the memory utils (#715) 3 years ago			`import gc`
			`from collections import namedtuple`

[legacy] clean up legacy code (#4743) * [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci 1 year ago			`import psutil`
			`import torch`
			`import torch.distributed as dist`
[utils] correct cpu memory used and capacity in the context of multi-process (#726) 3 years ago			`from packaging import version`
[refactor] refactor the memory utils (#715) 3 years ago
[npu] change device to accelerator api (#5239) * update accelerator * fix timer * fix amp * update * fix * update bug * add error raise * fix autocast * fix set device * remove doc accelerator * update doc * update doc * update doc * use nullcontext * update cpu * update null context * change time limit for example * udpate * update * update * update * [npu] polish accelerator code --------- Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com> Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com> 11 months ago			`from colossalai.accelerator import get_accelerator`
[legacy] clean up legacy code (#4743) * [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci 1 year ago			`from colossalai.legacy.core import global_context as gpc`
			`from colossalai.logging import get_dist_logger`

[refactor] refactor the memory utils (#715) 3 years ago			`_GLOBAL_CUDA_MEM_FRACTION = 1.0`
[gemini] APIs to set cpu memory capacity (#809) 3 years ago			`_GLOBAL_CPU_MEM_CAPACITY = -1`
[refactor] refactor the memory utils (#715) 3 years ago

			`def _bytes_to_MB(val, decimal=2):`
			`"""A byte-to-Megabyte converter, default using binary notation.`

			`:param val: X bytes to convert`
			`:return: X' MB`
			`"""`
			`return round(val / (1024 * 1024), decimal)`


			`# copy from PatrickStar`
			`def _get_cpu_memory_info():`
			`ps_mem_info = namedtuple("ps_mem_info", ["total", "free", "cached", "buffers", "used"])`
			`try:`
			`# psutil reads the memory info from /proc/memory_info,`
			`# which results in returning the host memory instead of`
			`# that of container.`
			`# Here we try to read the container memory with method in:`
			`# https://stackoverflow.com/a/46213331/5163915`
			`mems = {}`
			`with open("/sys/fs/cgroup/memory/memory.meminfo", "rb") as f:`
			`for line in f:`
			`fields = line.split()`
			`mems[fields[0]] = int(fields[1]) * 1024`
			`total = mems[b"MemTotal:"]`
			`free = mems[b"MemFree:"]`
			`cached = mems[b"Cached:"]`
			`buffers = mems[b"Buffers:"]`
			`used = total - free - cached - buffers`
			`if used < 0:`
			`used = total - free`
			`mem_info = ps_mem_info(total=total, free=free, cached=cached, buffers=buffers, used=used)`
			`except FileNotFoundError:`
			`mems = psutil.virtual_memory()`
			`mem_info = ps_mem_info(`
			`total=mems.total,`
			`free=mems.free,`
			`cached=mems.cached,`
			`buffers=mems.buffers,`
			`used=mems.used,`
			`)`
			`return mem_info`


			`def report_memory_usage(message, logger=None, report_cpu=False):`
			`"""Calculate and print RAM usage (in GB)`

			`Args:`
			`message (str): A prefix message to add in the log.`
			logger (:class:`colossalai.logging.DistributedLogger`): The logger used to record memory information.
			`report_cpu (bool, optional): Whether to report CPU memory.`

			`Raises:`
			`EnvironmentError: Raise error if no distributed environment has been initialized.`
			`"""`
[legacy] clean up legacy code (#4743) * [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci 1 year ago			`if not dist.is_initialized():`
[refactor] refactor the memory utils (#715) 3 years ago			`raise EnvironmentError("No distributed environment is initialized")`

			`gpu_allocated = _bytes_to_MB(torch.cuda.memory_allocated())`
			`gpu_max_allocated = _bytes_to_MB(torch.cuda.max_memory_allocated())`
			`gpu_cached = _bytes_to_MB(torch.cuda.memory_reserved())`
			`gpu_max_cached = _bytes_to_MB(torch.cuda.max_memory_reserved())`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`full_log = (`
			`f"{message}: GPU: allocated {gpu_allocated} MB, max allocated {gpu_max_allocated} MB, "`
[refactor] refactor the memory utils (#715) 3 years ago			`+ f"cached: {gpu_cached} MB, max cached: {gpu_max_cached} MB"`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`)`
[refactor] refactor the memory utils (#715) 3 years ago
			`if report_cpu:`
			`# python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports`
			`gc.collect()`
			`vm_stats = psutil.virtual_memory()`
			`vm_used = _bytes_to_MB(vm_stats.total - vm_stats.available)`
			`full_log += f", CPU Virtual Memory: used = {vm_used} MB, percent = {vm_stats.percent}%"`

			`if logger is None:`
			`logger = get_dist_logger()`
			`logger.info(full_log)`

			`# get the peak memory to report correct data, so reset the counter for the next call`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+`
[refactor] refactor the memory utils (#715) 3 years ago			`torch.cuda.reset_peak_memory_stats()`


			`def colo_device_memory_capacity(device: torch.device) -> int:`
			`"""`
			`Get the capacity of the memory of the device`

			`Args:`
			`device (torch.device): a device`

			`Returns:`
			`int: size in byte`
			`"""`
			`assert isinstance(device, torch.device)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if device.type == "cpu":`
[utils] correct cpu memory used and capacity in the context of multi-process (#726) 3 years ago			`# In the context of 1-CPU-N-GPU, the memory capacity of the current process is 1/N overall CPU memory.`
[gemini] APIs to set cpu memory capacity (#809) 3 years ago			`return colo_get_cpu_memory_capacity() / gpc.num_processes_on_current_node`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if device.type == "cuda":`
[npu] change device to accelerator api (#5239) * update accelerator * fix timer * fix amp * update * fix * update bug * add error raise * fix autocast * fix set device * remove doc accelerator * update doc * update doc * update doc * use nullcontext * update cpu * update null context * change time limit for example * udpate * update * update * update * [npu] polish accelerator code --------- Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com> Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com> 11 months ago			`return (`
			`torch.cuda.get_device_properties(get_accelerator().get_current_device()).total_memory`
			`* _GLOBAL_CUDA_MEM_FRACTION`
			`)`
[refactor] refactor the memory utils (#715) 3 years ago

			`def colo_device_memory_used(device: torch.device) -> int:`
			`"""`
			`Get the device memory on device belonging to the current process.`

			`Args:`
			`device (torch.device): a device`

			`Returns:`
			`int: memory size in bytes`
			`"""`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if device.type == "cpu":`
[refactor] refactor the memory utils (#715) 3 years ago			`mem_info = _get_cpu_memory_info()`
[utils] correct cpu memory used and capacity in the context of multi-process (#726) 3 years ago			`# In the context of 1-CPU-N-GPU, the memory usage of the current process is 1/N CPU memory used.`
			`# Each process consumes the same amount of memory.`
			`ret = mem_info.used / gpc.num_processes_on_current_node`
[refactor] refactor the memory utils (#715) 3 years ago			`return ret`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`elif device.type == "cuda":`
[refactor] refactor the memory utils (#715) 3 years ago			`ret: int = torch.cuda.memory_allocated(device)`
			`# get the peak memory to report correct data, so reset the counter for the next call`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+`
[refactor] refactor the memory utils (#715) 3 years ago			`torch.cuda.reset_peak_memory_stats(device)`
			`return ret`


			`def colo_set_process_memory_fraction(ratio: float) -> None:`
[legacy] clean up legacy code (#4743) * [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci 1 year ago			`"""colo_set_process_memory_fraction`
[refactor] refactor the memory utils (#715) 3 years ago
			`set how much cuda memory used on the gpu belonging to the current process.`

			`Args:`
			`ratio (float): a ratio between 0. ~ 1.`
			`"""`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if version.parse(torch.__version__) < version.parse("1.8"):`
			`logger = get_dist_logger("colo_set_process_memory_fraction")`
			`logger.warning("colo_set_process_memory_fraction failed because torch version is less than 1.8")`
[utils] correct cpu memory used and capacity in the context of multi-process (#726) 3 years ago			`return`
[refactor] refactor the memory utils (#715) 3 years ago			`global _GLOBAL_CUDA_MEM_FRACTION`
			`_GLOBAL_CUDA_MEM_FRACTION = ratio`
[npu] change device to accelerator api (#5239) * update accelerator * fix timer * fix amp * update * fix * update bug * add error raise * fix autocast * fix set device * remove doc accelerator * update doc * update doc * update doc * use nullcontext * update cpu * update null context * change time limit for example * udpate * update * update * update * [npu] polish accelerator code --------- Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com> Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com> 11 months ago			`torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_accelerator().get_current_device())`
[gemini] APIs to set cpu memory capacity (#809) 3 years ago

			`def colo_set_cpu_memory_capacity(size: int) -> None:`
			`global _GLOBAL_CPU_MEM_CAPACITY`
			`mem_info = _get_cpu_memory_info()`
			`total_size = mem_info.total`
			`if size <= total_size:`
			`_GLOBAL_CPU_MEM_CAPACITY = size`
			`else:`
			`_GLOBAL_CPU_MEM_CAPACITY = total_size`


			`def colo_get_cpu_memory_capacity() -> int:`
			`"""`
			`Get the cpu memory capacity. We may not use all of it.`
			`Returns:`
			`int: _description_`
			`"""`
			`global _GLOBAL_CPU_MEM_CAPACITY`
			`if _GLOBAL_CPU_MEM_CAPACITY == -1:`
			`mem_info = _get_cpu_memory_info()`
			`return mem_info.total`
			`else:`
			`return _GLOBAL_CPU_MEM_CAPACITY`