mirror of https://github.com/hpcaitech/ColossalAI
[zero] dump memory stats for sharded model (#548)
parent 763dc325f1
commit 107b99ddb1
@@ -45,10 +45,18 @@ class MemStatsCollector:
     def overall_cuda(self):
         return self._overall_cuda
 
+    @property
+    def model_data_cuda_GB(self):
+        return [elem / 1e9 for elem in self._model_data_cuda]
+
     @property
     def model_data_cuda(self):
         return self._model_data_cuda
 
+    @property
+    def non_model_data_cuda_GB(self):
+        return [elem / 1e9 for elem in self.non_model_data_cuda]
+
     @property
     def non_model_data_cuda(self):
         """Non model data stats
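
The two new *_GB properties are plain unit conversions over the byte samples the collector already stores. A minimal standalone sketch of that pattern (the class name and sample values below are illustrative, not the library's):

    # Minimal sketch of the byte-to-GB conversion added above; names and values are illustrative.
    class StatsSketch:
        def __init__(self):
            # one CUDA model-data sample per sampling point, in bytes
            self._model_data_cuda = [1_500_000_000, 2_250_000_000]

        @property
        def model_data_cuda_GB(self):
            # 1 GB is taken as 1e9 bytes, matching the diff
            return [elem / 1e9 for elem in self._model_data_cuda]

    print(StatsSketch().model_data_cuda_GB)   # [1.5, 2.25]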
@@ -11,6 +11,9 @@ from colossalai.engine.ophooks import register_ophooks_recursively
 from colossalai.engine.ophooks.zero_hook import ZeroHook
 from colossalai.engine.paramhooks import BaseParamHookMgr
 from colossalai.logging import get_dist_logger
+from colossalai.utils import get_current_device
+from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
+from colossalai.utils.memory_utils.utils import colo_cuda_memory_capacity
 from colossalai.utils.memory_tracer.memstats_collector import MemStatsCollector
 from colossalai.utils.memory_tracer.model_data_memtracer import \
     GLOBAL_MODEL_DATA_TRACER
@@ -131,6 +134,29 @@ class ShardedModelV2(nn.Module):
     def cpu_offload(self):
         return self._cpu_offload
 
+    def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
+        """
+        Dump memory tracer collected information to a file.
+        try:
+            # forward: model(inputs)
+            # backward: optimizer.backward()
+        except Exception as e:
+            model.dump_memory_stats()
+            exit(0)
+        """
+        if self._use_memory_tracer:
+            self.logger.error(f'dump memory tracer collected information to {filename}', ranks=[0])
+            if gpc.get_global_rank() == 0:
+                with open(filename, 'w+') as f:
+                    f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device())/1e9} GB\n')
+                    f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device())/1e9} GB\n')
+                    f.write('model data\n')
+                    f.write(str(self._memstats_collector.model_data_cuda_GB))
+                    f.write('\n')
+                    f.write('non model data\n')
+                    f.write(str(self._memstats_collector.non_model_data_cuda_GB))
+                    f.write('\n')
+
     def forward(self, *args: Any, **kwargs: Any) -> torch.Tensor:
         if self._iter_cnter == 0 and self._memstats_collector:
             # the operation will affect the flag in ZeroHook
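
The docstring above sketches the intended call pattern: run an iteration, and on failure write the tracer's samples to the log file. A hedged usage sketch of that pattern; `model`, `optimizer`, `criterion`, `data`, and `label` are placeholders assumed to come from a ShardedModelV2/ShardedOptimizerV2 setup with the memory tracer enabled:

    # Hedged usage sketch for dump_memory_stats(); all objects below are assumed to be
    # built elsewhere (ShardedModelV2 with use_memory_tracer=True, ShardedOptimizerV2, data loader).
    try:
        out = model(data)               # forward: model(inputs)
        loss = criterion(out, label)
        optimizer.backward(loss)        # backward: optimizer.backward()
        optimizer.step()
    except Exception:
        # e.g. on CUDA OOM, persist the collected stats to dump_mem_stats.log
        model.dump_memory_stats()
        exit(0)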
@@ -147,6 +173,7 @@ class ShardedModelV2(nn.Module):
 
     def backward_by_grad(self, tensor, grad):
         torch.autograd.backward(tensors=tensor, grad_tensors=grad)
+
         self._post_backward_operations()
         for ophook in self._ophook_list:
             ophook.post_iter()
@@ -154,9 +181,6 @@ class ShardedModelV2(nn.Module):
     def _update_memstats(self):
         if self._iter_cnter == 0 and self._memstats_collector:
             self._memstats_collector.finish_collection()
-            self.logger.debug(f'model data cuda, {self._memstats_collector.model_data_cuda}')
-            self.logger.debug(f'non-model data cuda, {self._memstats_collector.non_model_data_cuda}')
-
         if self._memstats_collector:
             self._memstats_collector.reset_sampling_cnter()
             # cuda margin space = cuda mem capacity - max fwd/bwd cuda mem used.
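
The retained comment defines the margin as device capacity minus the peak forward/backward usage the collector observed. A standalone sketch of that arithmetic with made-up byte values (in the library, `colo_cuda_memory_capacity` and the collector's overall samples would supply the real numbers):

    # Sketch of the "cuda margin space" arithmetic; values are illustrative, in bytes.
    cuda_capacity = 16_000_000_000                                           # stand-in for colo_cuda_memory_capacity()
    overall_cuda_samples = [3_000_000_000, 9_500_000_000, 7_200_000_000]     # stand-in for the collector's overall CUDA samples

    max_fwd_bwd_cuda = max(overall_cuda_samples)
    cuda_margin_space = cuda_capacity - max_fwd_bwd_cuda
    print(f'cuda margin space: {cuda_margin_space / 1e9} GB')                # 6.5 GB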
@@ -24,7 +24,6 @@ from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, col
 from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
 
 
-
 class OptimState(Enum):
     SCALED = 1
     UNSCALED = 2
@@ -139,6 +138,10 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         if self._use_memory_tracer:
             GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
 
+        self._use_memory_tracer = self.model.use_memory_tracer
+        if self._use_memory_tracer:
+            GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
+
     def get_memory_usage(self) -> Tuple[int, int]:
         """
         Get the memory usage of the optimizer. Including master_params (param fp32),
@@ -190,13 +193,15 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         # Now p.data is sharded
         # So optimizer states are sharded naturally
 
-        self._logger.debug(f"Before step ShardedOptimizerV2 consumes {self.get_memory_usage()[0]/1e6} MB CUDA Memory!",
-                           ranks=[0])
+        self._logger.debug(
+            f"Before step ShardedOptimizerV2 consumes {self.get_memory_usage()[0]/1e6} MB CUDA Memory, {self.get_memory_usage()[1]/1e6} MB CPU Memory!",
+            ranks=[0])
 
         ret = self.optim.step(*args, **kwargs)
 
-        self._logger.debug(f"After step ShardedOptimizerV2 consumes {self.get_memory_usage()[0]/1e6} MB CUDA Memory!",
-                           ranks=[0])
+        self._logger.debug(
+            f"After step ShardedOptimizerV2 consumes {self.get_memory_usage()[0]/1e6} MB CUDA Memory, {self.get_memory_usage()[1]/1e6} MB CPU Memory!",
+            ranks=[0])
         # Copy master param data (fp32) to payload of col_attr (fp16)
         # TODO() improve efficiency by gathering tensors into a chunk and transferring
         # a chunk.
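
The reworked debug messages only reformat the two byte counts returned by get_memory_usage(). A hedged, standalone sketch of that before/after-step reporting; the stand-in get_memory_usage below returns made-up values, assumed here to be (cuda_bytes, cpu_bytes) in line with the Tuple[int, int] signature shown in the diff:

    # Sketch of the before/after-step memory report; get_memory_usage here is a stand-in.
    def get_memory_usage():
        return 2_400_000_000, 800_000_000      # (cuda_bytes, cpu_bytes), illustrative

    def report(stage):
        cuda_b, cpu_b = get_memory_usage()
        return (f"{stage} step ShardedOptimizerV2 consumes "
                f"{cuda_b / 1e6} MB CUDA Memory, {cpu_b / 1e6} MB CPU Memory!")

    print(report("Before"))
    # optimizer.step(*args, **kwargs) would run here
    print(report("After"))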