from typing import Any, Optional import torch from colossalai.kernel.op_builder import FusedOptimBuilder from colossalai.utils import multi_tensor_applier from .cpu_adam import CPUAdam class HybridAdam(CPUAdam): """Implements Adam algorithm. Supports parameters updating on both GPU and CPU, depending on the device of parameters. But the parameters and gradients should on the same device: * Parameters on CPU and gradients on CPU is allowed. * Parameters on GPU and gradients on GPU is allowed. * Parameters on GPU and gradients on CPU is **not** allowed. `HybridAdam` requires CUDA extensions which can be built during installation or runtime. This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam. * For parameters updating on CPU, it uses CPUAdam. * For parameters updating on GPU, it uses FusedAdam. * Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients. :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``, or ``torch.optim.Adam`` with ``adamw_mode=False`` Adam was been proposed in `Adam: A Method for Stochastic Optimization`_. Arguments: model_params (iterable): iterable of parameters of dicts defining parameter groups. lr (float, optional): learning rate. (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square. (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability. (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) NOT SUPPORTED yet in CPUAdam! adamw_mode (boolean, optional): Apply L2 regularization or weight decay True for decoupled weight decay(also known as AdamW) (default: True) simd_log (boolean, optional): whether to show if you are using SIMD to accelerate. (default: False) nvme_offload_fraction (float, optional): Fraction of optimizer states to be offloaded to NVMe. Defaults to 0.0. nvme_offload_dir (Optional[str], optional): Directory to save NVMe offload files. If it's ``None``, a random temporary directory will be used. Defaults to None. .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ # Number of fp32 shards for per parameter # Param weight, grad, momentum and variance num_fp32_shards_per_param = 4 def __init__( self, model_params, lr=1e-3, bias_correction=True, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, adamw_mode=True, nvme_offload_fraction: float = 0.0, nvme_offload_dir: Optional[str] = None, **defaults: Any, ): super().__init__( model_params, lr, bias_correction, betas, eps, weight_decay, adamw_mode, nvme_offload_fraction, nvme_offload_dir, ) if torch.cuda.is_available(): fused_optim = FusedOptimBuilder().load() self.gpu_adam_op = fused_optim.multi_tensor_adam self._dummy_overflow_buf = torch.cuda.IntTensor([0]) @torch.no_grad() def step(self, closure=None, div_scale: float = -1): loss = None if closure is not None: with torch.enable_grad(): loss = closure() self._pre_step("exp_avg", "exp_avg_sq") for _, group in enumerate(self.param_groups): g_l, p_l, m_l, v_l = [], [], [], [] group_step = 0 for _, p in enumerate(group["params"]): if p.grad is None: continue state = self.state[p] target_device = p.device if len(state) == 0: state["step"] = 0 # gradient momentums state["exp_avg"] = torch.zeros_like(p, device=target_device) # gradient variances state["exp_avg_sq"] = torch.zeros_like(p, device=target_device) self._post_state_init(p) state["step"] += 1 group_step = state["step"] beta1, beta2 = group["betas"] if target_device.type == "cpu" or target_device.type == "npu": assert state["exp_avg"].device.type in ("cpu", "npu"), "exp_avg should stay on cpu" assert state["exp_avg_sq"].device.type in ("cpu", "npu"), "exp_avg should stay on cpu" self._pre_update(p, "exp_avg", "exp_avg_sq") if p.grad.dtype is torch.bfloat16 or p.grad.device.type == "npu": # cpu adam kernel does not support bf16 now bias_correction1 = 1 - beta1 ** state["step"] bias_correction2 = 1 - beta2 ** state["step"] self.torch_adam_update( p.data, p.grad.data, state["exp_avg"], state["exp_avg_sq"], group["lr"], beta1, beta2, group["eps"], group["weight_decay"], bias_correction1, bias_correction2, self.adamw_mode, ) else: self.cpu_adam_op.step( state["step"], group["lr"], beta1, beta2, group["eps"], group["weight_decay"], group["bias_correction"], p.data, p.grad.data, state["exp_avg"], state["exp_avg_sq"], div_scale, ) self._post_update(p, "exp_avg", "exp_avg_sq") elif target_device.type == "cuda": assert state["exp_avg"].device.type == "cuda", "exp_avg should stay on cuda" assert state["exp_avg_sq"].device.type == "cuda", "exp_avg should stay on cuda" # record the state by group and update at once g_l.append(p.grad.data) p_l.append(p.data) m_l.append(state["exp_avg"]) v_l.append(state["exp_avg_sq"]) else: raise RuntimeError if len(g_l) > 0: adamw_mode = 1 if self.adamw_mode else 0 bias_correction = 1 if group["bias_correction"] else 0 multi_tensor_applier( self.gpu_adam_op, self._dummy_overflow_buf, [g_l, p_l, m_l, v_l], group["lr"], group["betas"][0], group["betas"][1], group["eps"], group_step, adamw_mode, bias_correction, group["weight_decay"], div_scale, ) self._post_step() return loss