mirror of https://github.com/hpcaitech/ColossalAI
polish docstring of zero (#612)
parent
02b187c14f
commit
0ef8819c67
|
@ -31,10 +31,15 @@ from ._utils import (cast_float_arguments, cast_tensor_to_fp16, cast_tensor_to_f
|
||||||
class ShardedModelV2(nn.Module):
|
class ShardedModelV2(nn.Module):
|
||||||
"""
|
"""
|
||||||
A wrapper for the PyTorch module that shards the model parameters across the memory of multiple GPUs.
|
A wrapper for the PyTorch module that shards the model parameters across the memory of multiple GPUs.
|
||||||
Only 1/#nproc of parameters, gradients are stored in local CUDA memory, so forward and backward
|
Only `1/#nproc` of parameters, gradients are stored in local CUDA memory, so forward and backward
|
||||||
passes can be executed with limited CUDA memory budget.
|
passes can be executed with limited CUDA memory budget.
|
||||||
|
|
||||||
Note that you must use `ShardedModelV2` with `ShardedOptimizerV2`.
|
Note:
|
||||||
|
You must use ``ShardedModelV2`` with ``ShardedOptimizerV2``.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
Make sure you don't use gradient accumulation and your optimizer can work with fp16 gradient and fp32 parameter,
|
||||||
|
if you enable ``reuse_fp16_shard``.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
module (nn.Module): A sharded module, which must be initialized by `ZeroInitContext`.
|
module (nn.Module): A sharded module, which must be initialized by `ZeroInitContext`.
|
||||||
|
@ -145,15 +150,20 @@ class ShardedModelV2(nn.Module):
|
||||||
def cpu_offload(self):
|
def cpu_offload(self):
|
||||||
return self._cpu_offload
|
return self._cpu_offload
|
||||||
|
|
||||||
def dump_memory_stats(self, filename: Optional[str] = 'dump_mem_stats.log') -> None:
|
def dump_memory_stats(self, filename: str = 'dump_mem_stats.log') -> None:
|
||||||
"""
|
"""Dump the information collected by the memory tracer to a file.
|
||||||
dummy memory tracer collected infomation to a file.
|
|
||||||
try:
|
Example::
|
||||||
# forward: model(inputs)
|
|
||||||
# backward: optimizer.backward()
|
try:
|
||||||
except Exception as e:
|
# forward: model(inputs)
|
||||||
model.dump_memory_stats()
|
# backward: optimizer.backward()
|
||||||
exit(0)
|
except Exception as e:
|
||||||
|
model.dump_memory_stats()
|
||||||
|
exit(0)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename (str, optional): Output file name. Defaults to 'dump_mem_stats.log'.
|
||||||
"""
|
"""
|
||||||
if self._use_memory_tracer:
|
if self._use_memory_tracer:
|
||||||
self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
|
self.logger.error(f'dump memort tracer collected infomation to a {filename}', ranks=[0])
|
||||||
|
|
|
@ -31,24 +31,28 @@ class OptimState(Enum):
|
||||||
|
|
||||||
|
|
||||||
class ShardedOptimizerV2(ColossalaiOptimizer):
|
class ShardedOptimizerV2(ColossalaiOptimizer):
|
||||||
"""A wrapper for optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO).
|
"""A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO).
|
||||||
|
|
||||||
By default, the ZeRO optimizer stage 3 offloads optimizer states to the CPU.
|
By default, the ZeRO optimizer stage 3 offloads optimizer states to the CPU.
|
||||||
|
|
||||||
We apply the Device-aware Operator Placement technique for OS placement from the following paper.
|
We apply the Device-aware Operator Placement technique for OS placement from the following paper.
|
||||||
|
|
||||||
PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
|
`PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
|
||||||
https://arxiv.org/abs/2108.05818
|
|
||||||
|
|
||||||
GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
|
GPU margin space is the remaining space after removing peak non-model data from the overall GPU memory,
|
||||||
which is detected by a runtime memory tracer.
|
which is detected by a runtime memory tracer.
|
||||||
|
|
||||||
We place as many OS chunks in the margin space as possible.
|
We place as many OS chunks in the margin space as possible.
|
||||||
|
|
||||||
The size of margin space can be controlled by `gpu_margin_mem_ratio`。
|
The size of margin space can be controlled by ``gpu_margin_mem_ratio``.
|
||||||
If it is set as 0.0, it is the same as classical ZeRO optimizer.
|
If it is set as ``0.0``, it is the same as classical ZeRO optimizer.
|
||||||
|
|
||||||
NOTE() You must use `ShardedOptimizerV2` with `ShardedModelV2`.
|
Note:
|
||||||
|
You must use ``ShardedOptimizerV2`` with ``ShardedModelV2``.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
Make sure you enable ``use_memory_tracer`` in ``ShardedModelV2``,
|
||||||
|
if you set ``gpu_margin_mem_ratio > 0``.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
|
sharded_model (ShardedModelV2): A sharded model initialized by class ShardedModelV2. The optimizer will use the
|
||||||
|
@ -56,7 +60,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
||||||
optimizer (Optimizer): An Optimizer instance.
|
optimizer (Optimizer): An Optimizer instance.
|
||||||
cpu_offload (bool, optional): Whether to offload the optimizer states to CPU. Defaults to False.
|
cpu_offload (bool, optional): Whether to offload the optimizer states to CPU. Defaults to False.
|
||||||
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
|
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
|
||||||
which will be used when using hybrid CPU optimizer. Defaults to 0.0.
|
which will be used when using hybrid CPU optimizer.
|
||||||
|
Make sure `reuse_fp16_shard` is enabled in `ShardedModelV2`, if `gpu_margin_mem_ratio` > `0.0`.
|
||||||
|
Defaults to 0.0.
|
||||||
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
|
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
|
||||||
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
|
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
|
||||||
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
|
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
|
||||||
|
@ -66,6 +72,9 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
||||||
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
|
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
|
||||||
dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
|
dp_process_group (Optional[ProcessGroup], optional): data parallel process group. Defaults to None.
|
||||||
mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
|
mp_process_group (Optional[ProcessGroup], optional): model parallel process group. Defaults to None.
|
||||||
|
|
||||||
|
.. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
|
||||||
|
https://arxiv.org/abs/2108.05818
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
@ -144,9 +153,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
|
||||||
GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
|
GLOBAL_MODEL_DATA_TRACER.register_optimizer(self)
|
||||||
|
|
||||||
def get_memory_usage(self) -> Tuple[int, int]:
|
def get_memory_usage(self) -> Tuple[int, int]:
|
||||||
"""
|
""" Get the memory usage of the optimizer. Including master_params (param fp32),
|
||||||
Get the memory usage of the optimizer. Including master_params (param fp32),
|
momentum (``self.state[p]['exp_avg']``) variance (``self.state[p]['exp_avg_sq']``)
|
||||||
momentum (self.state[p]['exp_avg']) variance (self.state[p]['exp_avg_sq'])
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple[int, int]: cuda/cpu memory usage in Byte.
|
Tuple[int, int]: cuda/cpu memory usage in Byte.
|
||||||
|
|
Loading…
Reference in New Issue