
[memory] set cuda mem frac (#506)

Branch: pull/511/head
Author: Jiarui Fang, 3 years ago, committed by GitHub
Commit: 9330be0f3c
5 changed files:

  1. colossalai/utils/commons/__init__.py (3 changes)
  2. colossalai/utils/memory_tracer/async_memtracer.py (5 changes)
  3. colossalai/utils/memory_utils/bucket_tensor_copy.py (0 changes, renamed)
  4. colossalai/utils/memory_utils/utils.py (44 changes)
  5. tests/test_utils/test_bucket_tensor_copy.py (2 changes)

colossalai/utils/commons/__init__.py (3 changes)

@@ -1,3 +0,0 @@
-from .bucket_tensor_copy import BucketizedTensorCopy
-
-__all__ = ['BucketizedTensorCopy']

colossalai/utils/memory_tracer/async_memtracer.py (5 changes)

@@ -29,6 +29,10 @@ class AsyncMemoryMonitor:
     An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
     at interval of 1/(10**power) sec.
+
+    The idea comes from Runtime Memory Tracer of PatrickStar
+    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
+    https://arxiv.org/abs/2108.05818

     :param power: the power of time interval, defaults to 10
     :type power: int
@@ -54,6 +58,7 @@ class AsyncMemoryMonitor:
        self.keep_measuring = False

        current_device = get_current_device()

        def _set_cuda_device():
            torch.cuda.set_device(current_device)
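
The hunk above pins the monitor's sampling worker to the caller's GPU before any measurement runs. A minimal sketch of that pattern, assuming the helper is installed as a ThreadPoolExecutor initializer; only torch.cuda.set_device comes from the diff, the surrounding names are illustrative:

    # Sketch: run sampling work in a one-worker executor whose initializer pins
    # the thread to the device that was current when the monitor was created.
    # Without this, CUDA queries from the worker thread would target device 0.
    from concurrent.futures import ThreadPoolExecutor
    import torch

    def make_monitor_executor() -> ThreadPoolExecutor:
        current_device = torch.cuda.current_device()

        def _set_cuda_device():
            torch.cuda.set_device(current_device)

        return ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)

    executor = make_monitor_executor()
    future = executor.submit(torch.cuda.max_memory_allocated)  # sampled on the pinned device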

colossalai/utils/commons/bucket_tensor_copy.py → colossalai/utils/memory_utils/bucket_tensor_copy.py (renamed, 0 changes)

colossalai/utils/memory_utils/utils.py (44 changes)

@@ -5,12 +5,27 @@ from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
 from typing import Union

+_GLOBAL_CUDA_MEM_FRACTION = 1.0
+
-def colo_cuda_memory_capacity():
+
+def colo_set_process_memory_fraction(ratio: float) -> None:
+    """colo_set_process_memory_fraction
+
+    Set how much CUDA memory can be used on the GPU belonging to the current process.
+
+    Args:
+        ratio (float): a ratio between 0. and 1.
+    """
+    global _GLOBAL_CUDA_MEM_FRACTION
+    _GLOBAL_CUDA_MEM_FRACTION = ratio
+    torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_current_device())
+
+
+def colo_cuda_memory_capacity() -> float:
     """
     Get cuda memory capacity of the current cuda.
     """
-    return torch.cuda.get_device_properties(get_current_device()).total_memory
+    return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION


 def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t: Union[ShardedTensor, torch.Tensor]) -> None:
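
The two functions added above form the new memory-fraction API: colo_set_process_memory_fraction caps the current process and colo_cuda_memory_capacity reports the capped capacity. A usage sketch, assuming both are importable from colossalai.utils.memory_utils.utils; the 0.4 ratio is illustrative:

    # Sketch: limit this process to 40% of the device's memory, then read the
    # effective capacity back. colo_cuda_memory_capacity() scales total_memory
    # by the fraction set here.
    from colossalai.utils.memory_utils.utils import (colo_cuda_memory_capacity,
                                                     colo_set_process_memory_fraction)

    colo_set_process_memory_fraction(0.4)   # forwards to torch.cuda.set_per_process_memory_fraction
    capacity = colo_cuda_memory_capacity()  # total_memory * 0.4 after the call above
    print(f'usable CUDA memory: {capacity / 1024**3:.2f} GB')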
@@ -50,10 +65,25 @@ def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t
         src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)


-def colo_model_data_move_to_cpu(t: torch.Tensor):
-    assert isinstance(t, torch.Tensor)
-    if t.device.type == 'cpu':
+def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:
+    """colo_model_data_move_to_cpu
+
+    Move a model data tensor from GPU to CPU.
+
+    Args:
+        t (Union[ShardedTensor, torch.Tensor]): the model data tensor to be moved
+    """
+    if isinstance(t, ShardedTensor):
+        t_payload = t.payload
+    elif isinstance(t, torch.Tensor):
+        t_payload = t
+    else:
+        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')
+
+    if t_payload.device.type == 'cpu':
         return

-    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t)
-    t.data = t.data.cpu()
+    # TODO() optimize the tensor moving with non-blocking
+    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t_payload)
+    t_payload.data = t_payload.data.cpu()
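
The reworked colo_model_data_move_to_cpu now accepts either a ShardedTensor or a plain torch.Tensor, untracks the payload from GLOBAL_MODEL_DATA_TRACER, and moves it to CPU. A short sketch of the widened interface, assuming the tensor's payload is already registered with the tracer:

    # Sketch: a plain CUDA tensor is accepted directly; a ShardedTensor would
    # contribute its .payload instead. Payloads already on the CPU return early.
    import torch
    from colossalai.utils.memory_utils.utils import colo_model_data_move_to_cpu

    t = torch.empty(1024, device='cuda')   # assumed to be tracked by GLOBAL_MODEL_DATA_TRACER
    colo_model_data_move_to_cpu(t)         # payload untracked, then t.data moved to CPU
    colo_model_data_move_to_cpu(t)         # no-op: the payload already lives on the CPU
    # Any other argument type (e.g. a list) raises TypeError.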

tests/test_utils/test_bucket_tensor_copy.py (2 changes)

@@ -1,4 +1,4 @@
-from colossalai.utils.commons import BucketizedTensorCopy
+from colossalai.utils.memory_utils.bucket_tensor_copy import BucketizedTensorCopy
 from colossalai.zero.sharded_param import ShardedParamV2
 from colossalai.utils import free_port
 import torch
