[gemini] refactor gemini mgr (#1151)

* refactor gemini mgr

* update __init__
pull/1164/head
ver217 2022-06-22 11:54:36 +08:00 committed by GitHub
parent f8eec98ff5
commit 54aabb8da4
4 changed files with 17 additions and 15 deletions

View File

@@ -1,4 +1,5 @@
from .stateful_tensor_mgr import StatefulTensorMgr
from .tensor_placement_policy import TensorPlacementPolicyFactory
from .gemini_mgr import GeminiManager
__all__ = ['StatefulTensorMgr', 'TensorPlacementPolicyFactory']
__all__ = ['StatefulTensorMgr', 'TensorPlacementPolicyFactory', 'GeminiManager']
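With GeminiManager re-exported in __all__, it can be imported directly from the package alongside the existing managers. A minimal usage sketch, assuming the package path is colossalai.gemini (the relative imports above do not show the full path):

from colossalai.gemini import GeminiManager, StatefulTensorMgr, TensorPlacementPolicyFactory
# GeminiManager now resolves at package level since it is listed in __all__.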

View File

@@ -1,4 +1,5 @@
import torch
import functools
from .memory_tracer.memstats_collector import MemStatsCollectorV2
from typing import List, Optional, Tuple
from time import time
@@ -23,10 +24,12 @@ class GeminiManager:
self._compute_list: List[Tuple[Chunk, ...]] = []
self._compute_idx: int = -1
self._cpu_gpu_move_volume = 0
self._h2d_volume = 0
self._d2h_volume = 0
self._layout_time = 0
self._evict_time = 0
self._warmup = True
self._comp_cuda_demand_time = 0
def pre_iter(self):
if self._mem_stats_collector and self._warmup:
@@ -39,9 +42,11 @@ class GeminiManager:
self._mem_stats_collector.finish_collection()
self._warmup = False
self._compute_idx = -1
self._cpu_gpu_move_volume = 0
self._h2d_volume = 0
self._d2h_volume = 0
self._layout_time = 0
self._evict_time = 0
self._comp_cuda_demand_time = 0
def adjust_layout(self, chunks: Tuple[Chunk, ...], group_name: str) -> None:
""" Adjust the layout of statefuil tensor according to the information provided
@@ -57,22 +62,19 @@ class GeminiManager:
warmup=self._warmup,
compute_list=self._compute_list,
compute_idx=self._compute_idx)
self._cpu_gpu_move_volume += vol
self._d2h_volume += vol
self._evict_time += evict_time
# move COMPUTE tensors to CUDA
self._cpu_gpu_move_volume += cuda_demand
self._h2d_volume += cuda_demand
@property
def cpu_gpu_move_volume(self):
return self._cpu_gpu_move_volume
# @functools.lru_cache(maxsize=None)
# TODO: test lru
@functools.lru_cache(maxsize=None)
def _get_layout_info(self, compute_idx: int, warmup: bool, chunks: Tuple[Chunk, ...], group_name: str):
start = time()
cuda_demand = 0
for chunk in chunks:
if chunk.device_type == 'cpu' or chunk.is_empty:
cuda_demand += chunk.mem
self._comp_cuda_demand_time += time() - start
can_evict_chunks = []
for chunk in self._chunk_manager.chunk_groups[group_name]:
if not chunk.is_empty and chunk.device_type == 'cuda' and chunk.can_move_device:
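Taken together, this file's changes split the single _cpu_gpu_move_volume counter into _h2d_volume (CPU→CUDA) and _d2h_volume (CUDA→CPU), add a _comp_cuda_demand_time timer, and turn the previously commented-out "TODO: test lru" into an active functools.lru_cache on _get_layout_info. A minimal, self-contained sketch of that memoization pattern (not the real GeminiManager; plain integers stand in for Chunk objects):

import functools
from time import time

class LayoutInfoSketch:
    def __init__(self):
        self._comp_cuda_demand_time = 0.0

    @functools.lru_cache(maxsize=None)
    def _get_layout_info(self, compute_idx: int, warmup: bool, chunk_sizes: tuple):
        # Every positional argument, including self, becomes part of the cache key,
        # so the values the real manager passes (the chunks tuple, the group name)
        # must be hashable for the decorator to work.
        start = time()
        cuda_demand = sum(chunk_sizes)            # pretend every chunk must move to CUDA
        self._comp_cuda_demand_time += time() - start
        return cuda_demand

mgr = LayoutInfoSketch()
mgr._get_layout_info(0, True, (1024, 2048))       # computed and cached
mgr._get_layout_info(0, True, (1024, 2048))       # cache hit: the body (and the timer) is skipped

Because the key includes compute_idx, warmup, the chunks, and the group name, a step that recurs with an identical key reuses the cached layout information instead of recomputing it.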

View File

@@ -102,7 +102,7 @@ class AutoPlacementPolicy(PlacementPolicy):
total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
freed_cuda_model_data = 0
end = time()
if avail_cuda_model_data < cuda_demand:
# Move cuda_demand - avail_cuda_model_data volume of tensors
# to_free_cuda_model_data = cuda_demand - avail_cuda_model_data
@@ -111,7 +111,6 @@ class AutoPlacementPolicy(PlacementPolicy):
if not warmup:
to_free_chunks = self._sort_can_evict_chunks(tuple(to_free_chunks), compute_idx, tuple(compute_list))
# print(self._sort_can_evict_chunks.cache_info())
end = time()
for chunk in to_free_chunks:
if freed_cuda_model_data >= to_free_cuda_model_data:
break
@@ -121,7 +120,7 @@ class AutoPlacementPolicy(PlacementPolicy):
raise RuntimeError(
f"Adjust layout failed! No enough CUDA memory! Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
)
return freed_cuda_model_data, end - start
return freed_cuda_model_data, time() - start
@staticmethod
@functools.lru_cache(maxsize=None)
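The timing change in this file replaces the earlier end = time() snapshot with time() - start evaluated at the return, so the reported eviction time spans everything from entry through the chunk-freeing loop. A small sketch of that pattern with hypothetical names (move_to_cpu is an assumed callback, not an API from the file above):

from time import time

def evict_chunks_with_timing(to_free_chunks, to_free_cuda_model_data, move_to_cpu):
    start = time()
    freed_cuda_model_data = 0
    for chunk in to_free_chunks:
        if freed_cuda_model_data >= to_free_cuda_model_data:
            break
        # move_to_cpu is a stand-in that evicts one chunk and returns the bytes freed
        freed_cuda_model_data += move_to_cpu(chunk)
    # measure at the return itself instead of an earlier end = time() snapshot
    return freed_cuda_model_data, time() - start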

View File

@@ -225,7 +225,7 @@ class ZeroDDP(ColoDDP):
self.chunk_manager.exec_lazy_release()
self._setup_grads_ptr()
self._logger.debug(
f'layout time: {self.gemini_manager._layout_time}, evict time: {self.gemini_manager._evict_time}, PCIE move vol: {self.gemini_manager._cpu_gpu_move_volume}B'
f'comp cuda demand time: {self.gemini_manager._comp_cuda_demand_time}, layout time: {self.gemini_manager._layout_time}, evict time: {self.gemini_manager._evict_time}, CPU->CUDA vol: {self.gemini_manager._h2d_volume}B, CUDA->CPU vol: {self.gemini_manager._d2h_volume}B'
)
self.gemini_manager.post_iter()
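The updated debug message reports host-to-device and device-to-host traffic separately; the single PCIe volume the old message printed is simply the sum of the two new counters. A hypothetical helper illustrating that relationship (not part of the commit):

def total_pcie_move_volume(gemini_manager) -> int:
    # Hypothetical helper: the former _cpu_gpu_move_volume equals the sum of the
    # directional counters introduced by this commit.
    return gemini_manager._h2d_volume + gemini_manager._d2h_volume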