polish docstring of zero (#612)

pull/615/head
ver217 2022-04-01 14:50:56 +08:00 committed by GitHub
parent 02b187c14f
commit 0ef8819c67
2 changed files with 39 additions and 21 deletions

View File

@@ -31,10 +31,15 @@ from ._utils import (cast_float_arguments, cast_tensor_to_fp16, cast_tensor_to_f
 class ShardedModelV2(nn.Module):
     """
     A wrapper for the PyTorch module shards the model parameters among multiple GPU memory.
-    Only 1/#nproc of parameters, gradients are stored in local CUDA memory, so forward and backward
+    Only `1/#nproc` of parameters, gradients are stored in local CUDA memory, so forward and backward
     passes can be executed with limited CUDA memory budget.
 
-    Note that you must use `ShardedModelV2` with `ShardedOptimizerV2`.
+    Note:
+        You must use ``ShardedModelV2`` with ``ShardedOptimizerV2``.
+
+    Note:
+        Make sure you don't use gradient accumulation and your optimizer can work with fp16 gradient and fp32 parameter,
+        if you enable ``reuse_fp16_shard``.
 
     Args:
         module (nn.Module): A sharded module, which must be initialized by `ZeroInitContext`.
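For context, a minimal sketch of the pairing this note requires. It is not part of the commit; the import paths and the `ZeroInitContext` arguments follow the colossalai layout of this period and may differ, and the two-layer model is purely illustrative::

    import torch
    import torch.nn as nn
    from colossalai.zero.init_ctx import ZeroInitContext
    from colossalai.zero.shard_utils import TensorShardStrategy
    from colossalai.zero.sharded_model import ShardedModelV2
    from colossalai.zero.sharded_optim import ShardedOptimizerV2

    shard_strategy = TensorShardStrategy()
    # Parameters must be created inside ZeroInitContext so they are sharded.
    with ZeroInitContext(target_device=torch.device('cuda'),
                         shard_strategy=shard_strategy,
                         shard_param=True):
        model = nn.Sequential(nn.Linear(1024, 4096), nn.Linear(4096, 1024))

    # Per the note above, ShardedModelV2 only works together with ShardedOptimizerV2.
    model = ShardedModelV2(model, shard_strategy)
    optimizer = ShardedOptimizerV2(model, torch.optim.Adam(model.parameters(), lr=1e-3))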
@@ -145,15 +150,20 @@ class ShardedModelV2(nn.Module):
     def cpu_offload(self):
         return self._cpu_offload
 
-    def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
-        """
-        dummy memory tracer collected infomation to a file.
-        try:
-            # forward: model(inputs)
-            # backward: optimizer.backward()
-        except Exception as e:
-            model.dump_memory_stats()
-            exit(0)
+    def dump_memory_stats(self, filename: str = 'dump_mem_stats.log') -> None:
+        """Dummy memory tracer collected infomation to a file.
+
+        Example::
+
+            try:
+                # forward: model(inputs)
+                # backward: optimizer.backward()
+            except Exception as e:
+                model.dump_memory_stats()
+                exit(0)
+
+        Args:
+            filename (str, optional): Output file name. Defaults to 'dump_mem_stats.log'.
         """
         if self._use_memory_tracer:
             self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
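The pattern the rewritten docstring describes, spelled out as a hedged sketch; `model` and `optimizer` are the wrapped pair from the sketch above, and `data`, `label`, and the loss function are illustrative::

    # Dump the tracer's statistics if a training step fails (e.g. runs out of memory).
    # Only meaningful when ShardedModelV2 was built with the memory tracer enabled.
    criterion = nn.CrossEntropyLoss()
    try:
        output = model(data)          # forward
        loss = criterion(output, label)
        optimizer.backward(loss)      # backward goes through the optimizer wrapper
        optimizer.step()
    except Exception:
        model.dump_memory_stats()     # writes to 'dump_mem_stats.log' by default
        exit(0)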

View File

@@ -31,24 +31,28 @@ class OptimState(Enum):
 class ShardedOptimizerV2(ColossalaiOptimizer):
-    """A wrapper for optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO).
+    """A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO).
 
     By default the ZeRO optimizer stage 3 offload Optimizer States on CPU.
 
     We apply the Device-aware Operator Placement technique for OS placement from the following paper.
 
-    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
-    https://arxiv.org/abs/2108.05818
+    `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
 
     GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
     which is detected by a runtime memory tracer.
 
     We place as many OS chunks in the margin space as possible.
 
-    The size of margin space can be controlled by `gpu_margin_mem_ratio`
-    If it is set as 0.0, it is the same as classical ZeRO optimizer.
+    The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
+    If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
 
-    NOTE() You must use `ShardedOptimizerV2` with `ShardedModelV2`.
+    Note:
+        You must use ``ShardedOptimizerV2`` with ``ShardedModelV2``.
+
+    Note:
+        Make sure you enable ``use_memory_tracer`` in ``ShardedModelV2``,
+        if you set ``gpu_margin_mem_ratio > 0``.
 
     Args:
         sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
@@ -56,7 +60,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         optimizer (Optimizer): An Optimizer instance.
         cpu_offload (bool, optional): Is offloading the optimizer states to CPU.. Defaults to False.
         gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
-            which will be used when using hybrid CPU optimizer. Defaults to 0.0.
+            which will be used when using hybrid CPU optimizer.
+            Make sure `reuse_fp16_shard` is enabled in `ShardedModelV2`, if `gpu_margin_mem_ratio` > `0.0`.
+            Defaults to 0.0.
         initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
         min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
         growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
@@ -66,6 +72,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
         max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
         dp_process_group (Optional[ProcessGroup], optional): data paralle process group. Defaults to None.
         mp_process_group (Optional[ProcessGroup], optional): model paralle process group. Defaults to None.
+
+    .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
+        https://arxiv.org/abs/2108.05818
     """
 
     def __init__(self,
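To make the margin-space rule in the docstring concrete, a back-of-envelope illustration; the numbers are invented, and in reality the peak non-model figure comes from the runtime memory tracer::

    GiB = 1024 ** 3
    gpu_total = 40 * GiB              # whole device memory
    peak_non_model_data = 12 * GiB    # measured by the memory tracer
    gpu_margin_mem_ratio = 0.5

    margin_space = gpu_total - peak_non_model_data              # 28 GiB left over
    os_chunk_budget = int(margin_space * gpu_margin_mem_ratio)  # 14 GiB of OS chunks on GPU
    # gpu_margin_mem_ratio = 0.0 keeps every optimizer state on CPU (classical ZeRO).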
@@ -144,9 +153,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
             GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
 
     def get_memory_usage(self) -> Tuple[int, int]:
-        """
-        Get the memory usage of the optimizer. Including master_params (param fp32),
-        momentum (self.state[p]['exp_avg']) variance (self.state[p]['exp_avg_sq'])
+        """ Get the memory usage of the optimizer. Including master_params (param fp32),
+        momentum (``self.state[p]['exp_avg']``) variance (``self.state[p]['exp_avg_sq']``)
 
         Returns:
             Tuple[int, int]: cuda/cpu memory usage in Byte.
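A hedged usage sketch for this accessor, reusing `optimizer` from the earlier sketch; the (cuda, cpu) ordering is taken from the docstring's return description::

    # Report where the optimizer states currently live, e.g. after a step.
    cuda_bytes, cpu_bytes = optimizer.get_memory_usage()
    print(f'optimizer states: {cuda_bytes / 1024**2:.1f} MiB on CUDA, '
          f'{cpu_bytes / 1024**2:.1f} MiB on CPU')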