mirror of https://github.com/hpcaitech/ColossalAI

[memory] set cuda mem frac (#506)

parent 97933b6710
commit 9330be0f3c
@@ -1,3 +0,0 @@
-from .bucket_tensor_copy import BucketizedTensorCopy
-
-__all__ = ['BucketizedTensorCopy']
@@ -29,6 +29,10 @@ class AsyncMemoryMonitor:
     An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
     at interval of 1/(10**power) sec.
+
+    The idea comes from Runtime Memory Tracer of PatrickStar
+    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
+    https://arxiv.org/abs/2108.05818

     :param power: the power of time interval, defaults to 10
     :type power: int

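The class described by this docstring samples GPU memory from a background thread while an operator runs. A minimal usage sketch, not part of the diff, assuming the import path colossalai.utils.memory_tracer.async_memtracer and that finish() returns the peak usage sampled since start():

# Hedged sketch: profile peak CUDA memory around a forward pass.
import torch
from colossalai.utils.memory_tracer.async_memtracer import AsyncMemoryMonitor

monitor = AsyncMemoryMonitor(power=3)        # sample every 1 / 10**3 seconds
layer = torch.nn.Linear(1024, 1024).cuda()
x = torch.randn(64, 1024, device='cuda')

monitor.start()                              # launch the sampling thread
y = layer(x)                                 # workload being measured
peak = monitor.finish()                      # stop sampling; assumed to return peak usage in bytes
print(f'peak CUDA memory during forward: {peak} bytes')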
@@ -54,6 +58,7 @@ class AsyncMemoryMonitor:
         self.keep_measuring = False

         current_device = get_current_device()

         def _set_cuda_device():
             torch.cuda.set_device(current_device)

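The _set_cuda_device helper shown in this hunk exists because a freshly spawned worker thread targets CUDA device 0 until it calls torch.cuda.set_device itself. A standalone sketch of that pattern (illustrative only; the executor wiring is an assumption, not shown in the hunk):

# Sketch: pin a ThreadPoolExecutor worker to the caller's CUDA device so that
# memory queries issued from the worker thread hit the right GPU.
from concurrent.futures import ThreadPoolExecutor
import torch

current_device = torch.cuda.current_device()

def _set_cuda_device():
    # runs once inside the worker thread before any submitted task executes
    torch.cuda.set_device(current_device)

executor = ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)
future = executor.submit(torch.cuda.max_memory_allocated)
print(future.result())  # reported for current_device, not cuda:0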
@@ -5,12 +5,27 @@ from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DAT
 from typing import Union

+_GLOBAL_CUDA_MEM_FRACTION = 1.0

-def colo_cuda_memory_capacity():
+
+def colo_set_process_memory_fraction(ratio: float) -> None:
+    """colo_set_process_memory_fraction
+
+    set how much cuda memory can be used on the gpu belonging to the current process.
+
+    Args:
+        ratio (float): a ratio between 0. and 1.
+    """
+    global _GLOBAL_CUDA_MEM_FRACTION
+    _GLOBAL_CUDA_MEM_FRACTION = ratio
+    torch.cuda.set_per_process_memory_fraction(_GLOBAL_CUDA_MEM_FRACTION, get_current_device())
+
+
+def colo_cuda_memory_capacity() -> float:
     """
     Get cuda memory capacity of the current cuda.
     """
-    return torch.cuda.get_device_properties(get_current_device()).total_memory
+    return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION


 def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t: Union[ShardedTensor,
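Together, colo_set_process_memory_fraction and colo_cuda_memory_capacity cap the process's CUDA allocation and make the reported capacity reflect that cap. A hedged usage sketch (the module path colossalai.utils.memory_utils.utils is assumed from this diff; the 0.4 ratio is illustrative):

# Sketch: restrict this process to 40% of the device, then read the effective capacity.
from colossalai.utils.memory_utils.utils import colo_set_process_memory_fraction, colo_cuda_memory_capacity

colo_set_process_memory_fraction(0.4)     # caching allocator may use at most 40% of the GPU
capacity = colo_cuda_memory_capacity()    # total_memory * 0.4, in bytes
print(f'usable CUDA capacity for this process: {capacity / 1024**3:.2f} GiB')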
@@ -50,10 +65,25 @@ def colo_model_data_tensor_move(src_t: Union[ShardedTensor, torch.Tensor], tgt_t
         src_t.data = torch.tensor([], device=src_dev, dtype=src_t_payload.dtype)


-def colo_model_data_move_to_cpu(t: torch.Tensor):
-    assert isinstance(t, torch.Tensor)
-    if t.device.type == 'cpu':
+def colo_model_data_move_to_cpu(t: Union[ShardedTensor, torch.Tensor]) -> None:
+    """colo_model_data_move_to_cpu
+
+    move a model data tensor from gpu to cpu
+
+    Args:
+        t (Union[ShardedTensor, torch.Tensor]): the model data tensor to be moved to cpu
+    """
+
+    if isinstance(t, ShardedTensor):
+        t_payload = t.payload
+    elif isinstance(t, torch.Tensor):
+        t_payload = t
+    else:
+        raise TypeError(f'colo_model_data_move_to_cpu does not accept type {type(t)}')
+
+    if t_payload.device.type == 'cpu':
         return

-    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t)
-    t.data = t.data.cpu()
+    # TODO() optimize the tensor moving with non-blocking
+    GLOBAL_MODEL_DATA_TRACER.delete_tensor(t_payload)
+    t_payload.data = t_payload.data.cpu()
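The rewritten colo_model_data_move_to_cpu accepts either a ShardedTensor or a plain tensor, resolves the underlying payload, removes it from the model-data tracer, and rebinds the storage to the host in place. A minimal illustration with a plain tensor (import path assumed from this diff; in real use the tensor would normally have been registered with GLOBAL_MODEL_DATA_TRACER first):

# Sketch: move a model data tensor off the GPU and check it landed on the host.
import torch
from colossalai.utils.memory_utils.utils import colo_model_data_move_to_cpu

t = torch.randn(1024, 1024, device='cuda')
colo_model_data_move_to_cpu(t)    # untracks the payload and rebinds t.data to a CPU copy
assert t.device.type == 'cpu'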
@@ -1,4 +1,4 @@
-from colossalai.utils.commons import BucketizedTensorCopy
+from colossalai.utils.memory_utils.bucket_tensor_copy import BucketizedTensorCopy
 from colossalai.zero.sharded_param import ShardedParamV2
 from colossalai.utils import free_port
 import torch