mirror of https://github.com/hpcaitech/ColossalAI
polish docstring of zero (#612)
parent
02b187c14f
commit
0ef8819c67
|
@ -31,10 +31,15 @@ from ._utils import (cast_float_arguments, cast_tensor_to_fp16, cast_tensor_to_f
|
|||
class ShardedModelV2(nn.Module):
|
||||
"""
|
||||
A wrapper for a PyTorch module that shards the model parameters across the memory of multiple GPUs.
|
||||
Only 1/#nproc of parameters, gradients are stored in local CUDA memory, so forward and backward
|
||||
Only `1/#nproc` of parameters, gradients are stored in local CUDA memory, so forward and backward
|
||||
passes can be executed with limited CUDA memory budget.
|
||||
|
||||
Note that you must use `ShardedModelV2` with `ShardedOptimizerV2`.
|
||||
Note:
|
||||
You must use ``ShardedModelV2`` with ``ShardedOptimizerV2``.
|
||||
|
||||
Note:
|
||||
Make sure you don't use gradient accumulation and your optimizer can work with fp16 gradient and fp32 parameter,
|
||||
if you enable ``reuse_fp16_shard``.
|
||||
|
||||
Args:
|
||||
module (nn.Module): A sharded module, which must be initialized by `ZeroInitContext`.
|
||||
|
@ -145,15 +150,20 @@ class ShardedModelV2(nn.Module):
|
|||
def cpu_offload(self):
|
||||
return self._cpu_offload
|
||||
|
||||
def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
|
||||
"""
|
||||
dummy memory tracer collected infomation to a file.
|
||||
try:
|
||||
# forward: model(inputs)
|
||||
# backward: optimizer.backward()
|
||||
except Exception as e:
|
||||
model.dump_memory_stats()
|
||||
exit(0)
|
||||
def dump_memory_stats(self, filename: str = 'dump_mem_stats.log') -> None:
|
||||
"""Dump the information collected by the memory tracer to a file.
|
||||
|
||||
Example::
|
||||
|
||||
try:
|
||||
# forward: model(inputs)
|
||||
# backward: optimizer.backward()
|
||||
except Exception as e:
|
||||
model.dump_memory_stats()
|
||||
exit(0)
|
||||
|
||||
Args:
|
||||
filename (str, optional): Output file name. Defaults to 'dump_mem_stats.log'.
|
||||
"""
|
||||
if self._use_memory_tracer:
|
||||
self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
|
||||
|
|
|
@ -31,24 +31,28 @@ class OptimState(Enum):
|
|||
|
||||
|
||||
class ShardedOptimizerV2(ColossalaiOptimizer):
|
||||
"""A wrapper for optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO).
|
||||
"""A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO).
|
||||
|
||||
By default, the ZeRO stage 3 optimizer offloads optimizer states to the CPU.
|
||||
|
||||
We apply the Device-aware Operator Placement technique for OS placement from the following paper.
|
||||
|
||||
PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
|
||||
https://arxiv.org/abs/2108.05818
|
||||
`PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
|
||||
|
||||
GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
|
||||
which is detected by a runtime memory tracer.
|
||||
|
||||
We place as many OS chunks in the margin space as possible.
|
||||
|
||||
The size of margin space can be controlled by `gpu_margin_mem_ratio`。
|
||||
If it is set as 0.0, it is the same as classical ZeRO optimizer.
|
||||
The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
|
||||
If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
|
||||
|
||||
NOTE() You must use `ShardedOptimizerV2` with `ShardedModelV2`.
|
||||
Note:
|
||||
You must use ``ShardedOptimizerV2`` with ``ShardedModelV2``.
|
||||
|
||||
Note:
|
||||
Make sure you enable ``use_memory_tracer`` in ``ShardedModelV2``,
|
||||
if you set ``gpu_margin_mem_ratio > 0``.
|
||||
|
||||
Args:
|
||||
sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
|
||||
|
@ -56,7 +60,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
|||
optimizer (Optimizer): An Optimizer instance.
|
||||
cpu_offload (bool, optional): Whether to offload the optimizer states to CPU. Defaults to False.
|
||||
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
|
||||
which will be used when using hybrid CPU optimizer. Defaults to 0.0.
|
||||
which will be used when using hybrid CPU optimizer.
|
||||
Make sure `reuse_fp16_shard` is enabled in `ShardedModelV2`, if `gpu_margin_mem_ratio` > `0.0`.
|
||||
Defaults to 0.0.
|
||||
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
|
||||
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
|
||||
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
|
||||
|
@ -66,6 +72,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
|||
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
|
||||
dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
|
||||
mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
|
||||
|
||||
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
|
||||
https://arxiv.org/abs/2108.05818
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -144,9 +153,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
|||
GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
|
||||
|
||||
def get_memory_usage(self) -> Tuple[int, int]:
|
||||
"""
|
||||
Get the memory usage of the optimizer. Including master_params (param fp32),
|
||||
momentum (self.state[p]['exp_avg']) variance (self.state[p]['exp_avg_sq'])
|
||||
""" Get the memory usage of the optimizer. Including master_params (param fp32),
|
||||
momentum (``self.state[p]['exp_avg']``) and variance (``self.state[p]['exp_avg_sq']``).
|
||||
|
||||
Returns:
|
||||
Tuple[int, int]: CUDA/CPU memory usage in bytes.
|
||||
|
|
Loading…
Reference in New Issue