mirror of https://github.com/hpcaitech/ColossalAI
[zero] optimize grad offload (#539)
* optimize grad offload
* polish code
* polish code

parent 7d81b5b46e
commit fb841dd5c5
@@ -11,9 +11,10 @@ from colossalai.engine.ophooks import register_ophooks_recursively
 from colossalai.engine.ophooks.zero_hook import ZeroHook
 from colossalai.engine.paramhooks import BaseParamHookMgr
 from colossalai.logging import get_dist_logger
-from colossalai.utils.memory_tracer.model_data_memtracer import GLOBAL_MODEL_DATA_TRACER
-from colossalai.utils.memory_utils.utils import colo_model_data_move_to_cpu, colo_cuda_memory_capacity, colo_model_tensor_clone
 from colossalai.utils.memory_tracer.memstats_collector import MemStatsCollector
+from colossalai.utils.memory_tracer.model_data_memtracer import \
+    GLOBAL_MODEL_DATA_TRACER
+from colossalai.utils.memory_utils.utils import (colo_cuda_memory_capacity, colo_model_data_move_to_cpu)
 from colossalai.zero.shard_utils import BaseShardStrategy
 from colossalai.zero.sharded_model.reduce_scatter import ReduceScatterBucketer
 from torch.distributed import ProcessGroup
@@ -28,7 +29,7 @@ class ShardedModelV2(nn.Module):
     A wrapper for a PyTorch module that shards the model parameters across multiple GPUs' memory.
     Only 1/#nproc of the parameters and gradients are stored in local CUDA memory, so forward and backward
     passes can be executed within a limited CUDA memory budget.

     Note that you must use `ShardedModelV2` with `ShardedOptimizerV2`.

     Args:
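To make the docstring's claim concrete, here is a minimal, self-contained sketch of the idea behind ZeRO-3 style sharding: each rank keeps only 1/#nproc of every (flattened) parameter. This is illustrative only and does not use ColossalAI's shard strategies; `shard_param`, the padding scheme, and the hard-coded `rank`/`world_size` values are assumptions for the example.

import torch
import torch.nn as nn

def shard_param(param: torch.Tensor, rank: int, world_size: int) -> torch.Tensor:
    """Keep only this rank's 1/world_size slice of a flattened parameter (zero-padded at the tail)."""
    flat = param.detach().reshape(-1)
    shard_size = (flat.numel() + world_size - 1) // world_size
    padded = torch.zeros(shard_size * world_size, dtype=flat.dtype)
    padded[:flat.numel()] = flat
    return padded[rank * shard_size:(rank + 1) * shard_size].clone()

model = nn.Linear(1024, 1024)
world_size, rank = 4, 0
shards = [shard_param(p, rank, world_size) for p in model.parameters()]
full = sum(p.numel() for p in model.parameters())
kept = sum(s.numel() for s in shards)
print(f"rank {rank} keeps {kept} of {full} elements (~1/{world_size})")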
@@ -206,7 +207,7 @@ class ShardedModelV2(nn.Module):
            else:
                grad_payload = cast_tensor_to_fp32(p.col_attr.fp16_grad)
            if p.col_attr.offload_grad:
-                grad_payload = colo_model_tensor_clone(grad_payload, torch.device('cpu'))
+                colo_model_data_move_to_cpu(grad_payload)
            if p.col_attr.fp32_grad is not None:
                assert not self.reuse_fp16_shard, 'Gradient accumulation is not supported when reuse_fp16_shard=True'
                p.col_attr.fp32_grad.add_(grad_payload.view_as(p.col_attr.fp32_grad))
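The change in the hunk above is the core of the commit: instead of allocating a fresh CPU copy of the reduced gradient with `colo_model_tensor_clone`, the gradient's payload is moved to CPU in place with `colo_model_data_move_to_cpu`. The following is a hedged sketch of the difference between the two patterns in plain PyTorch; the helper names here are illustrative stand-ins, not ColossalAI's implementation.

import torch

def clone_to_cpu(t: torch.Tensor) -> torch.Tensor:
    # Old pattern: materialize a second, CPU-resident copy and return it.
    # The caller must rebind its reference, and the CUDA copy lives until it is freed.
    return t.detach().clone().cpu()

def move_to_cpu_inplace(t: torch.Tensor) -> None:
    # New pattern (sketch): repoint the tensor's storage at a CPU copy, so every
    # existing reference to `t` now sees CPU data and the CUDA memory can be released.
    if t.device.type == 'cpu':
        return
    t.data = t.data.cpu()

if torch.cuda.is_available():
    grad = torch.randn(1024, device='cuda')
    move_to_cpu_inplace(grad)
    print(grad.device)  # cpu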
@@ -10,14 +10,14 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.utils.memory_utils.utils import (colo_model_tensor_clone, colo_tensor_mem_usage)
 from colossalai.zero.sharded_model import ShardedModelV2
 from colossalai.zero.sharded_model._utils import cast_tensor_to_fp32
+from colossalai.zero.sharded_optim._utils import has_inf_or_nan
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
 from torch.optim import Optimizer
-from colossalai.zero.sharded_optim._utils import has_inf_or_nan
-from colossalai.utils.memory_utils.utils import colo_model_data_tensor_move, colo_tensor_mem_usage


 class OptimState(Enum):
@@ -29,17 +29,17 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
    """A wrapper for an optimizer. `ShardedOptimizerV2` and `ShardedModelV2` implement Zero Redundancy Optimizer (ZeRO).

    By default, the ZeRO optimizer stage 3 offloads Optimizer States (OS) to CPU.

    We apply the Device-aware Operator Placement technique for OS placement from the following paper:

    PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
    https://arxiv.org/abs/2108.05818

    GPU margin space is the space remaining after removing peak non-model data from the overall GPU memory,
    which is detected by a runtime memory tracer.

    We place as many OS chunks in the margin space as possible.

    The size of the margin space can be controlled by `gpu_margin_mem_ratio`.
    If it is set to 0.0, the behavior is the same as the classical ZeRO optimizer.

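The docstring's notion of "GPU margin space" can be made concrete with a small back-of-the-envelope calculation. The sketch below only illustrates the idea; the numbers, the `chunk_size` name, and the exact formula are assumptions, not ColossalAI's internal bookkeeping. The margin is what remains of GPU memory after peak non-model data is accounted for, `gpu_margin_mem_ratio` scales how much of that margin the optimizer may claim, and as many OS chunks as fit are kept on GPU.

GIB = 1024 ** 3

# Illustrative numbers (assumptions, not measured values).
gpu_capacity = 40 * GIB            # total GPU memory
peak_non_model_data = 12 * GIB     # activations, buffers, etc., reported by a memory tracer
gpu_margin_mem_ratio = 0.8         # fraction of the margin the optimizer may use
chunk_size = 512 * 1024 ** 2       # size of one optimizer-state (OS) chunk in bytes

margin_space = gpu_capacity - peak_non_model_data
usable_margin = margin_space * gpu_margin_mem_ratio
os_chunks_on_gpu = int(usable_margin // chunk_size)

print(f"margin space: {margin_space / GIB:.1f} GiB, "
      f"usable: {usable_margin / GIB:.1f} GiB, "
      f"OS chunks kept on GPU: {os_chunks_on_gpu}")
# With gpu_margin_mem_ratio = 0.0 the usable margin is 0 and all optimizer states
# stay on CPU, i.e. the classical ZeRO behavior described above.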
@@ -204,7 +204,8 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
            # Since p.data is fp32 and p.col_attr.sharded_data_tensor is fp16

            # TODO() optimize this line CPU (fp32) -> GPU (fp16)
-            colo_model_data_tensor_move(p, p.col_attr.sharded_data_tensor)
+            p.col_attr.sharded_data_tensor.reset_payload(
+                colo_model_tensor_clone(p.half(), torch.cuda.current_device()))

            if not is_param_sharded:
                # We gather full fp16 param here
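The optimizer-side change above replaces an in-place tensor move with "cast the fp32 master parameter to fp16, clone it onto the current CUDA device, and make that clone the payload of the sharded fp16 tensor." A hedged sketch of that data flow in plain PyTorch follows; `MiniShardedTensor` and its `reset_payload` are simplified stand-ins for ColossalAI's sharded tensor, shown only to illustrate the CPU (fp32) -> GPU (fp16) step flagged by the TODO comment.

import torch

class MiniShardedTensor:
    """Toy stand-in for a sharded fp16 tensor whose storage can be swapped out."""
    def __init__(self, payload: torch.Tensor):
        self.payload = payload

    def reset_payload(self, new_payload: torch.Tensor) -> None:
        # Drop the old storage and adopt the new one.
        self.payload = new_payload

if torch.cuda.is_available():
    device = torch.device('cuda', torch.cuda.current_device())
    # fp32 master copy of the (sharded) parameter, kept on CPU by the optimizer.
    master_param = torch.randn(4096, dtype=torch.float32, device='cpu')
    sharded_fp16 = MiniShardedTensor(torch.empty(4096, dtype=torch.float16, device=device))

    # After optimizer.step(): cast to fp16, copy to the GPU, and swap it in.
    sharded_fp16.reset_payload(master_param.half().to(device))
    print(sharded_fp16.payload.dtype, sharded_fp16.payload.device)  # torch.float16 cuda:0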