ColossalAI/colossalai/utils/memory_utils/utils.py

import torch
from colossalai.utils import get_current_device
from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor

from typing import Tuple, Union

# Fraction of the device's total CUDA memory attributed to this process.
# Written by colo_set_process_memory_fraction and read by
# colo_cuda_memory_capacity; 1.0 means the whole device.
_GLOBAL_CUDA_MEM_FRACTION = 1.0


def colo_tensor_mem_usage(tensor: Union[torch.Tensor, ShardedTensor]) -> Tuple[int, int]:
    """Report how many bytes a tensor occupies on CUDA and on CPU.

    Args:
        tensor (Union[torch.Tensor, ShardedTensor]): the tensor to measure.
            For a ShardedTensor its ``payload`` is measured.

    Returns:
        Tuple[int, int]: ``(cuda_bytes, cpu_bytes)``. Both entries are 0 when
        ``tensor`` is neither of the accepted types or when it resides on a
        device whose type is neither ``'cuda'`` nor ``'cpu'``.
    """
    if isinstance(tensor, ShardedTensor):
        payload = tensor.payload
    elif isinstance(tensor, torch.Tensor):
        payload = tensor
    else:
        # unsupported objects are reported as using no memory
        return 0, 0

    n_bytes = payload.numel() * payload.element_size()
    device_type = payload.device.type

    if device_type == 'cuda':
        return n_bytes, 0
    if device_type == 'cpu':
        return 0, n_bytes
    return 0, 0


def colo_set_process_memory_fraction(ratio: float) -> None:
    """colo_set_process_memory_fraction

    Set how much cuda memory may be used on the gpu belonging to the current
    process, and remember that fraction in ``_GLOBAL_CUDA_MEM_FRACTION`` so
    that ``colo_cuda_memory_capacity`` can scale the device capacity by it.

    Args:
        ratio (float): a ratio between 0. ~ 1.

    Raises:
        Any exception raised by ``torch.cuda.set_per_process_memory_fraction``
        propagates unchanged; in that case the remembered fraction is left
        untouched.
    """
    global _GLOBAL_CUDA_MEM_FRACTION
    # Apply the limit first and cache it only on success, so a rejected ratio
    # can never leave the cached fraction out of sync with the real limit.
    torch.cuda.set_per_process_memory_fraction(ratio, get_current_device())
    _GLOBAL_CUDA_MEM_FRACTION = ratio


def colo_cuda_memory_capacity() -> float:
    """
    Get the cuda memory capacity available to this process on the current device.

    Returns:
        float: the device's ``total_memory`` multiplied by the module-level
        ``_GLOBAL_CUDA_MEM_FRACTION``.
    """
    device_props = torch.cuda.get_device_properties(get_current_device())
    return device_props.total_memory * _GLOBAL_CUDA_MEM_FRACTION


def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor],
                                tgt_t: Union[ShardedTensor, torch.Tensor]) -> None:
    """
    A colossal API for model data tensor move.
    The content of the source tensor is copied into the target tensor with
    ``copy_``; the two tensors may reside on different devices (CPU or GPU).

    NOTE() The source tensor payload will be removed after this function:
    it is replaced by an empty tensor with the source's device and dtype.

    NOTE() This function itself does not record any communication volume.

    Args:
        src_t (Union[ShardedTensor, torch.Tensor]): source tensor
        tgt_t (Union[ShardedTensor, torch.Tensor]): target tensor
    """
    # ShardedTensor exposes its data via `payload`; plain tensors via `data`
    src_t_payload = src_t.payload if isinstance(src_t, ShardedTensor) else src_t.data
    tgt_t_payload = tgt_t.payload if isinstance(tgt_t, ShardedTensor) else tgt_t.data

    tgt_t_payload.copy_(src_t_payload)

    # remove payload of src_t by swapping in an empty tensor that keeps the
    # original device and dtype
    empty_payload = torch.tensor([], device=src_t_payload.device, dtype=src_t_payload.dtype)
    if isinstance(src_t, ShardedTensor):
        src_t.reset_payload(empty_payload)
    else:
        src_t.data = empty_payload


def colo_model_data_tensor_move_inline(t: Union[ShardedTensor, torch.Tensor],
                                       target_device: torch.device,
                                       use_tracer: bool = True) -> None:
    """
    Move a tensor to the target_device in place (its ``data`` is rebound).

    Args:
        t (Union[ShardedTensor, torch.Tensor]): the tensor to be moved. For a
            ShardedTensor its ``payload`` is moved.
        target_device (torch.device): the device to move the tensor to.
        use_tracer (bool): accepted for interface compatibility; not used by
            this function.

    Raises:
        TypeError: if ``t`` is neither a ShardedTensor nor a torch.Tensor.
        AssertionError: if ``target_device`` is not a ``torch.device``.
    """

    if isinstance(t, ShardedTensor):
        t_payload = t.payload
    elif isinstance(t, torch.Tensor):
        t_payload = t
    else:
        # f-string so the offending type is actually interpolated
        raise TypeError(f'colo_model_data_tensor_move_inline does not accept type {type(t)}')

    assert isinstance(target_device, torch.device)

    # deal with torch.device('cpu') and torch.device('cpu:0'):
    # only the device *type* is compared, so a tensor already on a device of
    # the same type (whatever its index) is left where it is
    if t_payload.device.type == target_device.type:
        return
    t_payload.data = t_payload.data.to(target_device)


def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:
    """colo_model_data_move_to_cpu

    Move a model data tensor to cpu in place (its ``data`` is rebound).
    Nothing happens when the tensor is already on cpu.

    Args:
        t (Union[ShardedTensor, torch.Tensor]): the tensor to be moved. For a
            ShardedTensor its ``payload`` is moved.

    Raises:
        TypeError: if ``t`` is neither a ShardedTensor nor a torch.Tensor.
    """

    if isinstance(t, ShardedTensor):
        t_payload = t.payload
    elif isinstance(t, torch.Tensor):
        t_payload = t
    else:
        # f-string so the offending type is actually interpolated
        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')

    if t_payload.device.type == 'cpu':
        return

    # TODO() optimize the tensor moving with non-blocking
    t_payload.data = t_payload.data.cpu()


def colo_model_tensor_clone(t: Union[ShardedTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:
    """
    Produce a version of a model data tensor on ``target_device``.

    The result comes from ``Tensor.to(target_device)``.
    NOTE(review): per the PyTorch docs, ``Tensor.to`` may return the very same
    tensor (not a copy) when it already lives on ``target_device`` — confirm
    callers do not rely on always getting fresh storage.

    Args:
        t (Union[ShardedTensor, torch.Tensor]): a model data tensor; for a
            ShardedTensor its ``payload`` is used.
        target_device (torch.device): the target device
    Returns:
        torch.Tensor: the tensor on ``target_device``
    """
    if isinstance(t, ShardedTensor):
        source = t.payload
    else:
        source = t

    return source.to(target_device)
[memory] add model data tensor moving api (#503) 3 years ago			`import torch`
			`from colossalai.utils import get_current_device`
			`from colossalai.zero.sharded_param.sharded_tensor import ShardedTensor`

[zero] get memory usage of sharded optim v2. (#542) 3 years ago			`from typing import Tuple, Union`
[memory] add model data tensor moving api (#503) 3 years ago
[memory] set cuda mem frac (#506) 3 years ago			`_GLOBAL_CUDA_MEM_FRACTION = 1.0`
[memory] add model data tensor moving api (#503) 3 years ago
[memory] set cuda mem frac (#506) 3 years ago
[zero] get memory usage of sharded optim v2. (#542) 3 years ago			`def colo_tensor_mem_usage(tensor: Union[torch.Tensor, ShardedTensor]) -> Tuple[int, int]:`
			`if isinstance(tensor, ShardedTensor):`
			`t = tensor.payload`
			`elif isinstance(tensor, torch.Tensor):`
			`t = tensor`
			`else:`
			`return 0, 0`

			`cuda_use, cpu_use = 0, 0`

			`mem_use = t.numel() * t.element_size()`
			`if t.device.type == 'cuda':`
			`cuda_use += mem_use`
			`elif t.device.type == 'cpu':`
			`cpu_use += mem_use`

			`return cuda_use, cpu_use`


[memory] set cuda mem frac (#506) 3 years ago			`def colo_set_process_memory_fraction(ratio: float) -> None:`
			`"""colo_set_process_memory_fraction`

			`set how much cuda memory used on the gpu belonging to the current process.`

			`Args:`
			`ratio (float): a ratio between 0. ~ 1.`
			`"""`
			`global _GLOBAL_CUDA_MEM_FRACTION`
			`_GLOBAL_CUDA_MEM_FRACTION = ratio`
			`torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_current_device())`


			`def colo_cuda_memory_capacity() -> float:`
[memory] add model data tensor moving api (#503) 3 years ago			`"""`
			`Get cuda memory capacity of the current cuda.`
			`"""`
[memory] set cuda mem frac (#506) 3 years ago			`return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION`
[memory] add model data tensor moving api (#503) 3 years ago

			`def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t: Union[ShardedTensor,`
			`torch.Tensor]) -> None:`
			`"""`
			`A colossal API for model data tensor move.`
			`The src and target tensors could be resident on both CPU and GPU.`

			`NOTE() The source tensor payload will be removed after this function.`

			`The function will record the communication volume between CPU and GPU.`
			`Args:`
			`t_src (Union[ShardedTensor, torch.Tensor]): source tensor`
			`tgt_t (Union[ShardedTensor, torch.Tensor]): target tensor`
			`"""`
			`if isinstance(src_t, ShardedTensor):`
			`src_t_payload = src_t.payload`
			`else:`
			`src_t_payload = src_t.data`
			`src_dev = src_t_payload.device`
			`if isinstance(tgt_t, ShardedTensor):`
			`tgt_t_payload = tgt_t.payload`
			`else:`
			`tgt_t_payload = tgt_t.data`
			`tgt_dev = tgt_t_payload.device`

			`tgt_t_payload.copy_(src_t_payload)`

			`# remove payload of src_t`
			`if isinstance(src_t, ShardedTensor):`
			`src_t.reset_payload(torch.tensor([], device=src_dev, dtype=src_t_payload.dtype))`
			`else:`
			`src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)`


[zero] refactor model data tracing (#522) 3 years ago			`def colo_model_data_tensor_move_inline(t: Union[ShardedTensor, torch.Tensor],`
			`target_device: torch.device,`
			`use_tracer: bool = True) -> None:`
[zero] add colo move inline (#521) 3 years ago			`"""`
			`move a tensor to the target_device`
			`Args:`
			`t (Union[ShardedTensor, torch.Tensor]): the tensor be moved`
			`"""`

			`if isinstance(t, ShardedTensor):`
			`t_payload = t.payload`
			`elif isinstance(t, torch.Tensor):`
			`t_payload = t`
			`else:`
			`raise TypeError('colo_model_data_move_to_cpu dose not accept type {type(t)}')`

			`assert isinstance(target_device, torch.device)`

			`# deal with torch.device('cpu') and torch.device('cpu:0)`
			`if t_payload.device.type == target_device.type:`
			`return`
			`t_payload.data = t_payload.data.to(target_device)`


[memory] set cuda mem frac (#506) 3 years ago			`def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:`
			`"""colo_model_data_move_to_cpu`

			`move a model data tensor from gpu to cpu`

			`Args:`
			`t (Union[ShardedTensor, torch.Tensor]): _description_`
			`"""`

			`if isinstance(t, ShardedTensor):`
			`t_payload = t.payload`
			`elif isinstance(t, torch.Tensor):`
			`t_payload = t`
			`else:`
			`raise TypeError('colo_model_data_move_to_cpu dose not accept type {type(t)}')`

			`if t_payload.device.type == 'cpu':`
[memory] add model data tensor moving api (#503) 3 years ago			`return`

[memory] set cuda mem frac (#506) 3 years ago			`# TODO() optimize the tensor moving with non-blocking`
			`t_payload.data = t_payload.data.cpu()`
[zero] fix grad offload (#528) * [zero] fix grad offload * polish code 3 years ago

			`def colo_model_tensor_clone(t: Union[ShardedTensor, torch.Tensor], target_device: torch.device) -> torch.Tensor:`
			`"""`
			`Clone a model data tensor`

			`Args:`
			`t (Union[ShardedTensor, torch.Tensor]): a model data tensor`
			`target_device (torch.device): the target device`
			`Returns:`
			`torch.Tensor: a cloned torch tensor`
			`"""`
			`t_payload = t.payload if isinstance(t, ShardedTensor) else t`

			`ret = t_payload.to(target_device)`
			`return ret`