@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
            Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
            In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
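
The hunk above only corrects a docstring typo, but the arguments it documents are easiest to read in context. The sketch below shows how they might be passed when wrapping a model; the import paths, the TensorShardStrategy helper and the exact constructor signature are assumptions that differ across ColossalAI releases, so treat it as an illustration rather than a verbatim API.

    from colossalai.zero.shard_utils import TensorShardStrategy   # assumed import path; varies by version
    from colossalai.zero.sharded_model import ShardedModelV2

    # `module` is assumed to be an nn.Module built inside ZeroInitContext,
    # as the full docstring requires.
    sharded_model = ShardedModelV2(
        module,
        TensorShardStrategy(),               # how parameter shards are distributed across ranks
        tensor_placement_policy='auto',      # 'cpu' / 'cuda' / 'auto', as documented above
        gradient_predivide_factor=1.0,       # gradients are divided by this before reduce-scatter
        reuse_fp16_shard=False,              # keep False whenever gradient accumulation is used
    )
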
@@ -205,7 +205,7 @@ class ShardedModelV2(nn.Module):
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
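
For context, the corrected logger line sits in the memory-stats dump routine whose docstring ends just above (`exit(0)` / `"""`). A usage sketch along the lines that docstring suggests is given below; the method name dump_memory_stats, the filename argument and the optimizer call are assumptions about the surrounding ColossalAI code, not part of this diff.

    # Sketch: dump the collected memory-tracer report when a training step fails
    # (typically a CUDA OOM), then stop, mirroring the try/except in the docstring.
    try:
        loss = criterion(model(inputs), labels)            # forward through the ShardedModelV2 wrapper
        optimizer.backward(loss)                           # backward via the sharded optimizer
    except Exception:
        model.dump_memory_stats('dump_mem_stats.log')      # assumed method name; writes "cuda reserved ... GB" etc.
        exit(0)
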
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
             # make parameters point to gradient

             assert param.colo_attr.saved_grad.is_null(
-            ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+            ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'

             param.colo_attr.grad_payload_reset(grad.data)
             # release the memory of param
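
The asserted invariant is easier to see with a toy example. The snippet below is plain PyTorch, not ColossalAI internals: it imitates "make parameters point to gradient" by adopting the fp16 gradient buffer as the parameter's storage, which is exactly why a still-pending accumulated gradient (a non-null saved_grad) cannot be tolerated.

    import torch

    # One fp16 parameter shard and the fp16 gradient produced for it by backward.
    param = torch.nn.Parameter(torch.randn(4).half())
    grad = torch.randn(4).half()

    # Stand-in for param.colo_attr.saved_grad.is_null(); with reuse_fp16_shard=True there
    # must be no gradient still waiting to be accumulated, or reusing the buffer would drop it.
    saved_grad_is_null = True
    assert saved_grad_is_null, 'Gradient accumulation is not supported when reuse_fp16_shard=True'

    # "Make parameters point to gradient": the parameter now shares the gradient's storage,
    # so only one fp16 buffer is kept alive instead of two.
    param.data = grad
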