mirror of https://github.com/hpcaitech/ColossalAI

[memory] set cuda mem frac (#506)

parent 97933b6710
commit 9330be0f3c
@@ -1,3 +0,0 @@
-from .bucket_tensor_copy import BucketizedTensorCopy
-
-__all__ = ['BucketizedTensorCopy']
@@ -29,6 +29,10 @@ class AsyncMemoryMonitor:
     An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
     at interval of 1/(10**power) sec.
+
+    The idea comes from Runtime Memory Tracer of PatrickStar
+    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
+    https://arxiv.org/abs/2108.05818

     :param power: the power of time interval, defaults to 10
     :type power: int

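The class described by this docstring samples GPU memory from a background thread while an operator runs. A minimal usage sketch, not part of the diff, assuming the import path colossalai.utils.memory_tracer.async_memtracer and that finish() returns the peak usage sampled since start():

# Hedged sketch: profile peak CUDA memory around a forward pass.
import torch
from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor

monitor = AsyncMemoryMonitor(power=3)        # sample every 1 / 10**3 seconds
layer = torch.nn.Linear(1024, 1024).cuda()
x = torch.randn(64, 1024, device='cuda')

monitor.start()                              # launch the sampling thread
y = layer(x)                                 # workload being measured
peak = monitor.finish()                      # stop sampling; assumed to return peak usage in bytes
print(f'peak CUDA memory during forward: {peak} bytes')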
@@ -54,6 +58,7 @@ class AsyncMemoryMonitor:
         self.keep_measuring = False

         current_device = get_current_device()

         def _set_cuda_device():
             torch.cuda.set_device(current_device)

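The _set_cuda_device helper shown in this hunk exists because a freshly spawned worker thread targets CUDA device 0 until it calls torch.cuda.set_device itself. A standalone sketch of that pattern (illustrative only; the executor wiring is an assumption, not shown in the hunk):

# Sketch: pin a ThreadPoolExecutor worker to the caller's CUDA device so that
# memory queries issued from the worker thread hit the right GPU.
from concurrent.futures import ThreadPoolExecutor
import torch

current_device = torch.cuda.current_device()

def _set_cuda_device():
    # runs once inside the worker thread before any submitted task executes
    torch.cuda.set_device(current_device)

executor = ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)
future = executor.submit(torch.cuda.max_memory_allocated)
print(future.result())  # reported for current_device, not cuda:0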
@@ -5,12 +5,27 @@ from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DAT
 from typing import Union

+_GLOBAL_CUDA_MEM_FRACTION = 1.0

-def colo_cuda_memory_capacity():
+
+def colo_set_process_memory_fraction(ratio: float) -> None:
+    """colo_set_process_memory_fraction
+
+    set how much cuda memory can be used on the gpu belonging to the current process.
+
+    Args:
+        ratio (float): a ratio between 0. and 1.
+    """
+    global _GLOBAL_CUDA_MEM_FRACTION
+    _GLOBAL_CUDA_MEM_FRACTION = ratio
+    torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_current_device())
+
+
+def colo_cuda_memory_capacity() -> float:
     """
     Get cuda memory capacity of the current cuda.
     """
-    return torch.cuda.get_device_properties(get_current_device()).total_memory
+    return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION


 def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t: Union[ShardedTensor,
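Together, colo_set_process_memory_fraction and colo_cuda_memory_capacity cap the process's CUDA allocation and make the reported capacity reflect that cap. A hedged usage sketch (the module path colossalai.utils.memory_utils.utils is assumed from this diff; the 0.4 ratio is illustrative):

# Sketch: restrict this process to 40% of the device, then read the effective capacity.
from colossalai.utils.memory_utils.utils import colo_set_process_memory_fraction, colo_cuda_memory_capacity

colo_set_process_memory_fraction(0.4)     # caching allocator may use at most 40% of the GPU
capacity = colo_cuda_memory_capacity()    # total_memory * 0.4, in bytes
print(f'usable CUDA capacity for this process: {capacity / 1024**3:.2f} GiB')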
@@ -50,10 +65,25 @@ def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t
         src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)


-def colo_model_data_move_to_cpu(t: torch.Tensor):
-    assert isinstance(t, torch.Tensor)
-    if t.device.type == 'cpu':
+def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:
+    """colo_model_data_move_to_cpu
+
+    move a model data tensor from gpu to cpu
+
+    Args:
+        t (Union[ShardedTensor, torch.Tensor]): the model data tensor to be moved to cpu
+    """
+
+    if isinstance(t, ShardedTensor):
+        t_payload = t.payload
+    elif isinstance(t, torch.Tensor):
+        t_payload = t
+    else:
+        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')
+
+    if t_payload.device.type == 'cpu':
         return

-    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t)
-    t.data = t.data.cpu()
+    # TODO() optimize the tensor moving with non-blocking
+    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t_payload)
+    t_payload.data = t_payload.data.cpu()
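The rewritten colo_model_data_move_to_cpu accepts either a ShardedTensor or a plain tensor, resolves the underlying payload, removes it from the model-data tracer, and rebinds the storage to the host in place. A minimal illustration with a plain tensor (import path assumed from this diff; in real use the tensor would normally have been registered with GLOBAL_MODEL_DATA_TRACER first):

# Sketch: move a model data tensor off the GPU and check it landed on the host.
import torch
from colossalai.utils.memory_utils.utils import colo_model_data_move_to_cpu

t = torch.randn(1024, 1024, device='cuda')
colo_model_data_move_to_cpu(t)    # untracks the payload and rebinds t.data to a CPU copy
assert t.device.type == 'cpu'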
@@ -1,4 +1,4 @@
-from colossalai.utils.commons import BucketizedTensorCopy
+from colossalai.utils.memory_utils.bucket_tensor_copy import BucketizedTensorCopy
 from colossalai.zero.sharded_param import ShardedParamV2
 from colossalai.utils import free_port
 import torch