From 9330be0f3c0a6833b1782838e8fad765bca9fc07 Mon Sep 17 00:00:00 2001 From: Jiarui Fang Date: Thu, 24 Mar 2022 16:57:13 +0800 Subject: [PATCH] [memory] set cuda mem frac (#506) --- colossalai/utils/commons/__init__.py | 3 -- .../utils/memory_tracer/async_memtracer.py | 5 +++ .../bucket_tensor_copy.py | 0 colossalai/utils/memory_utils/utils.py | 44 ++++++++++++++++--- tests/test_utils/test_bucket_tensor_copy.py | 2 +- 5 files changed, 43 insertions(+), 11 deletions(-) delete mode 100644 colossalai/utils/commons/__init__.py rename colossalai/utils/{commons => memory_utils}/bucket_tensor_copy.py (100%) diff --git a/colossalai/utils/commons/__init__.py b/colossalai/utils/commons/__init__.py deleted file mode 100644 index e48fad25c..000000000 --- a/colossalai/utils/commons/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .bucket_tensor_copy import BucketizedTensorCopy - -__all__ = ['BucketizedTensorCopy'] diff --git a/colossalai/utils/memory_tracer/async_memtracer.py b/colossalai/utils/memory_tracer/async_memtracer.py index 842aafbdd..4091f94aa 100644 --- a/colossalai/utils/memory_tracer/async_memtracer.py +++ b/colossalai/utils/memory_tracer/async_memtracer.py @@ -29,6 +29,10 @@ class AsyncMemoryMonitor: An Async Memory Monitor runing during computing. Sampling memory usage of the current GPU at interval of 1/(10**power) sec. + The idea comes from Runtime Memory Tracer of PatrickStar + PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management + https://arxiv.org/abs/2108.05818 + :param power: the power of time interval, defaults to 10 :type power: int @@ -54,6 +58,7 @@ class AsyncMemoryMonitor: self.keep_measuring = False current_device = get_current_device() + def _set_cuda_device(): torch.cuda.set_device(current_device) diff --git a/colossalai/utils/commons/bucket_tensor_copy.py b/colossalai/utils/memory_utils/bucket_tensor_copy.py similarity index 100% rename from colossalai/utils/commons/bucket_tensor_copy.py rename to colossalai/utils/memory_utils/bucket_tensor_copy.py diff --git a/colossalai/utils/memory_utils/utils.py b/colossalai/utils/memory_utils/utils.py index b049a92ca..b1c24994c 100644 --- a/colossalai/utils/memory_utils/utils.py +++ b/colossalai/utils/memory_utils/utils.py @@ -5,12 +5,27 @@ from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DAT from typing import Union +_GLOBAL_CUDA_MEM_FRACTION = 1.0 -def colo_cuda_memory_capacity(): + +def colo_set_process_memory_fraction(ratio: float) -> None: + """colo_set_process_memory_fraction + + set how much cuda memory used on the gpu belonging to the current process. + + Args: + ratio (float): a ratio between 0. ~ 1. + """ + global _GLOBAL_CUDA_MEM_FRACTION + _GLOBAL_CUDA_MEM_FRACTION = ratio + torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_current_device()) + + +def colo_cuda_memory_capacity() -> float: """ Get cuda memory capacity of the current cuda. """ - return torch.cuda.get_device_properties(get_current_device()).total_memory + return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t: Union[ShardedTensor, @@ -50,10 +65,25 @@ def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype) -def colo_model_data_move_to_cpu(t: torch.Tensor): - assert isinstance(t, torch.Tensor) - if t.device.type == 'cpu': +def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None: + """colo_model_data_move_to_cpu + + move a model data tensor from gpu to cpu + + Args: + t (Union[ShardedTensor, torch.Tensor]): _description_ + """ + + if isinstance(t, ShardedTensor): + t_payload = t.payload + elif isinstance(t, torch.Tensor): + t_payload = t + else: + raise TypeError('colo_model_data_move_to_cpu dose not accept type {type(t)}') + + if t_payload.device.type == 'cpu': return - GLOBAL_MODEL_DATA_TRACER.delete_tensor(t) - t.data = t.data.cpu() + # TODO() optimize the tensor moving with non-blocking + GLOBAL_MODEL_DATA_TRACER.delete_tensor(t_payload) + t_payload.data = t_payload.data.cpu() diff --git a/tests/test_utils/test_bucket_tensor_copy.py b/tests/test_utils/test_bucket_tensor_copy.py index 31d534b78..f190cb522 100644 --- a/tests/test_utils/test_bucket_tensor_copy.py +++ b/tests/test_utils/test_bucket_tensor_copy.py @@ -1,4 +1,4 @@ -from colossalai.utils.commons import BucketizedTensorCopy +from colossalai.utils.memory_utils.bucket_tensor_copy import BucketizedTensorCopy from colossalai.zero.sharded_param import ShardedParamV2 from colossalai.utils import free_port import torch