ColossalAI/colossalai/nn/optimizer/cpu_adam.py

import math
import torch

from colossalai.registry import OPTIMIZERS
from colossalai.nn.optimizer import CPU_ADAM_CNT


@OPTIMIZERS.register_module
class CPUAdam(torch.optim.Optimizer):
    r"""Implements the Adam algorithm.

    Supports parameter updates on both GPU and CPU, depending on the device of the parameters.
    The parameters and gradients must be on the same device:
      * Parameters on CPU and gradients on CPU is allowed.
      * Parameters on GPU and gradients on GPU is allowed.
      * Parameters on GPU and gradients on CPU is **not** allowed.

    Requires ColossalAI to be installed via ``pip install .``.

    This version of CPU Adam accelerates parameter updates on CPU with SIMD.
    Support of AVX2 or AVX512 is required.

    The GPU part is implemented in a naive way (plain PyTorch tensor ops).

    CPU Adam also supports hybrid precision calculation, e.g. fp32 parameters and fp16 gradients.

    :class:`colossalai.nn.optimizer.CPUAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
    or ``torch.optim.Adam`` with ``adamw_mode=False``

    Adam was proposed in `Adam: A Method for Stochastic Optimization`_.

    The AMSGrad variant from `On the Convergence of Adam and Beyond`_ is NOT SUPPORTED yet in CPUAdam.

    Arguments:
        model_params (iterable): iterable of parameters of dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (boolean, optional): whether to apply the Adam bias
            correction to the moment estimates. (default: True)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        adamw_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        simd_log (boolean, optional): whether to show if you are using SIMD to
            accelerate. (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    # Number of fp32 shards kept per parameter:
    # param weight, grad, momentum and variance
    num_fp32_shards_per_param = 4

    def __init__(self,
                 model_params,
                 lr=1e-3,
                 bias_correction=True,
                 betas=(0.9, 0.999),
                 eps=1e-8,
                 weight_decay=0,
                 adamw_mode=True,
                 simd_log=False):
        """Register the parameter groups and create the native Adam state.

        Raises:
            ImportError: if the ``cpu_adam`` extension module cannot be imported.
        """
        default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
        super().__init__(model_params, default_args)
        # ``opt_id`` is the handle passed to every call into the extension
        # (create_adam / adam_update / destroy_adam).
        self.opt_id = CPU_ADAM_CNT()
        self.adamw_mode = adamw_mode
        try:
            import cpu_adam
        except ImportError as e:
            raise ImportError('Please install colossalai from source code to use CPUAdam') from e
        cpu_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode, simd_log)
        # Only publish the extension handle once the native optimizer exists, so
        # that ``__del__`` never destroys an id that was not created.
        self.cpu_adam_op = cpu_adam

    def __del__(self):
        """Release the native optimizer state, if it was ever created."""
        # ``__del__`` also runs when ``__init__`` raised part-way through, in
        # which case ``cpu_adam_op`` was never assigned.
        cpu_adam_op = getattr(self, 'cpu_adam_op', None)
        if cpu_adam_op:
            cpu_adam_op.destroy_adam(self.opt_id)

    def torch_adam_update(self,
                          data,
                          grad,
                          exp_avg,
                          exp_avg_sq,
                          lr,
                          beta1,
                          beta2,
                          eps,
                          weight_decay,
                          bias_correction1,
                          bias_correction2,
                          use_adamw=False):
        """Apply one Adam/AdamW update to ``data`` in place using PyTorch ops.

        ``data``, ``exp_avg`` and ``exp_avg_sq`` are modified in place; ``grad``
        is left untouched.

        Arguments:
            data (torch.Tensor): parameter values to update.
            grad (torch.Tensor): gradient of ``data``; converted to fp32 before use.
            exp_avg (torch.Tensor): running average of the gradient.
            exp_avg_sq (torch.Tensor): running average of the squared gradient.
            lr (float): learning rate.
            beta1 (float): decay rate of ``exp_avg``.
            beta2 (float): decay rate of ``exp_avg_sq``.
            eps (float): term added to the denominator.
            weight_decay (float): weight decay coefficient; 0 disables it.
            bias_correction1 (float): divisor applied to the step size.
            bias_correction2 (float): divisor applied (under a square root) to
                the second-moment estimate.
            use_adamw (bool): if True, decay ``data`` directly (decoupled weight
                decay); otherwise add ``weight_decay * data`` to the gradient.
        """
        # FIXME(ver217): remove the below line when replace torch adam with fused adam
        grad = grad.float()

        if weight_decay != 0:
            if use_adamw:
                data.mul_(1 - lr * weight_decay)
            else:
                # Out-of-place add: the caller's gradient tensor must not change.
                grad = grad.add(data, alpha=weight_decay)

        # Decay the first and second moment running average coefficient
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        # TODO(jiaruifang) does not support amsgrad
        denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

        step_size = lr / bias_correction1

        data.addcdiv_(exp_avg, denom, value=-step_size)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step over all parameter groups.

        Parameters without a gradient are skipped. CPU parameters are updated
        by the ``cpu_adam`` extension, CUDA parameters by
        :meth:`torch_adam_update`.

        Arguments:
            closure (callable, optional): re-evaluates the model and returns
                the loss; it is called with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure is given.

        Raises:
            RuntimeError: if a parameter lives on a device other than CPU or CUDA.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            for p in group['params']:

                if p.grad is None:
                    continue

                state = self.state[p]

                target_device = p.device
                if len(state) == 0:
                    state['step'] = 0

                    # gradient momentums
                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
                    # gradient variances
                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)

                state['step'] += 1
                step = state['step']

                if target_device.type == 'cpu':
                    assert p.data.numel() == p.grad.data.numel(), "parameter and gradient should have the same size"
                    assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu"
                    assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg_sq should stay on cpu"
                    # NOTE(review): the trailing -1 is forwarded to the kernel as-is;
                    # presumably it disables loss scaling -- confirm against the extension.
                    self.cpu_adam_op.adam_update(self.opt_id, step, group['lr'], beta1, beta2, group['eps'],
                                                 group['weight_decay'], group['bias_correction'], p.data, p.grad.data,
                                                 state['exp_avg'], state['exp_avg_sq'], -1)
                elif target_device.type == 'cuda':
                    assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"
                    assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg_sq should stay on cuda"

                    # Honour the group's ``bias_correction`` flag, as the CPU path
                    # does; a correction factor of 1 leaves the estimates unscaled.
                    if group.get('bias_correction', True):
                        bias_correction1 = 1 - beta1**step
                        bias_correction2 = 1 - beta2**step
                    else:
                        bias_correction1 = 1.0
                        bias_correction2 = 1.0

                    # adam on cuda
                    self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'],
                                           beta1, beta2, group['eps'], group['weight_decay'], bias_correction1,
                                           bias_correction2, self.adamw_mode)
                else:
                    raise RuntimeError(
                        f"CPUAdam only supports parameters on 'cpu' or 'cuda', got '{target_device.type}'")
        return loss
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`import math`
[zero] sharded optim support hybrid cpu adam (#486) * sharded optim support hybrid cpu adam * update unit test * polish docstring 2022-03-22 06:56:59 +00:00			`import torch`

[docs] updatad docs of hybrid adam and cpu adam (#552) 2022-03-30 10:14:59 +00:00			`from colossalai.registry import OPTIMIZERS`
fix bugs in CPU adam (#633) * add cpu adam counter for all cpu adam * fixed updating error in adam kernel 2022-04-02 09:04:05 +00:00			`from colossalai.nn.optimizer import CPU_ADAM_CNT`
[docs] updatad docs of hybrid adam and cpu adam (#552) 2022-03-30 10:14:59 +00:00
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
[docs] updatad docs of hybrid adam and cpu adam (#552) 2022-03-30 10:14:59 +00:00			`@OPTIMIZERS.register_module`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`class CPUAdam(torch.optim.Optimizer):`
[docs] updatad docs of hybrid adam and cpu adam (#552) 2022-03-30 10:14:59 +00:00			`"""Implements Adam algorithm.`

			`Supports parameters updating on both GPU and CPU, depanding on the device of paramters.`
			`But the parameters and gradients should on the same device:`
			`* Parameters on CPU and gradients on CPU is allowed.`
			`* Parameters on GPU and gradients on GPU is allowed.`
			`* Parameters on GPU and gradients on CPU is not allowed.`

			Requires ColossalAI to be installed via ``pip install .``.

			`This version of CPU Adam accelates parameters updating on CPU with SIMD.`
			`Support of AVX2 or AVX512 is required.`

			`The GPU part is implemented in an naive way.`

			`CPU Adam also supports the hybrid precision calculation, eg. fp32 parameters and fp16 gradients.`

			:class:`colossalai.nn.optimizer.CPUAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
			or ``torch.optim.Adam`` with ``adamw_mode=False``

			Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.

			`Arguments:`
			`model_params (iterable): iterable of parameters of dicts defining`
			`parameter groups.`
			`lr (float, optional): learning rate. (default: 1e-3)`
			`betas (Tuple[float, float], optional): coefficients used for computing`
			`running averages of gradient and its square. (default: (0.9, 0.999))`
			`eps (float, optional): term added to the denominator to improve`
			`numerical stability. (default: 1e-8)`
			`weight_decay (float, optional): weight decay (L2 penalty) (default: 0)`
			`amsgrad (boolean, optional): whether to use the AMSGrad variant of this`
			algorithm from the paper `On the Convergence of Adam and Beyond`_
			`(default: False) NOT SUPPORTED yet in CPUAdam!`
			`adamw_mode (boolean, optional): Apply L2 regularization or weight decay`
			`True for decoupled weight decay(also known as AdamW) (default: True)`
			`simd_log (boolean, optional): whether to show if you are using SIMD to`
			`accelerate. (default: False)`
polish optimizer docstring (#619) 2022-04-01 08:27:03 +00:00
			`.. _Adam\: A Method for Stochastic Optimization:`
[docs] updatad docs of hybrid adam and cpu adam (#552) 2022-03-30 10:14:59 +00:00			`https://arxiv.org/abs/1412.6980`
			`.. _On the Convergence of Adam and Beyond:`
			`https://openreview.net/forum?id=ryQu7f-RZ`
			`"""`

[zero] sharded optim support hybrid cpu adam (#486) * sharded optim support hybrid cpu adam * update unit test * polish docstring 2022-03-22 06:56:59 +00:00			`# Number of fp32 shards for per parameter`
			`# Param weight, grad, momentum and variance`
			`num_fp32_shards_per_param = 4`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
			`def __init__(self,`
			`model_params,`
			`lr=1e-3,`
			`bias_correction=True,`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`betas=(0.9, 0.999),`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`eps=1e-8,`
			`weight_decay=0,`
			`adamw_mode=True,`
			`simd_log=False):`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`super(CPUAdam, self).__init__(model_params, default_args)`
fix bugs in CPU adam (#633) * add cpu adam counter for all cpu adam * fixed updating error in adam kernel 2022-04-02 09:04:05 +00:00			`self.opt_id = CPU_ADAM_CNT()`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`self.adamw_mode = adamw_mode`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`try:`
			`import cpu_adam`
			`except ImportError:`
			`raise ImportError('Please install colossalai from source code to use CPUAdam')`
			`self.cpu_adam_op = cpu_adam`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`self.cpu_adam_op.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode, simd_log)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
			`def __del__(self):`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00			`if self.cpu_adam_op:`
			`self.cpu_adam_op.destroy_adam(self.opt_id)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`def torch_adam_update(self,`
			`data,`
			`grad,`
			`exp_avg,`
			`exp_avg_sq,`
			`lr,`
			`beta1,`
			`beta2,`
			`eps,`
			`weight_decay,`
			`bias_correction1,`
			`bias_correction2,`
			`use_adamw=False):`
[zero] sharded model support the reuse of fp16 shard (#495) * sharded model supports reuse fp16 shard * rename variable * polish code * polish code * polish code 2022-03-23 06:59:59 +00:00			`# FIXME(ver217): remove the below line when replace torch adam with fused adam`
			`grad = grad.float()`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00
			`if weight_decay != 0:`
			`if use_adamw:`
			`data.mul_(1 - lr * weight_decay)`
			`else:`
			`grad = grad.add(data, alpha=weight_decay)`

			`# Decay the first and second moment running average coefficient`
			`exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)`
			`exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)`

			`# TODO(jiaruifang) dose not support amsgrad`
			`denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)`

			`step_size = lr / bias_correction1`

			`data.addcdiv_(exp_avg, denom, value=-step_size)`

[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`@torch.no_grad()`
			`def step(self, closure=None):`
			`loss = None`
			`if closure is not None:`
			`with torch.enable_grad():`
			`loss = closure()`

[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`for _, group in enumerate(self.param_groups):`
			`for _, p in enumerate(group['params']):`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
			`if p.grad is None:`
			`continue`

			`state = self.state[p]`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00
			`target_device = p.device`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`if len(state) == 0:`
			`state['step'] = 0`

			`# gradient momentums`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`# gradient variances`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
			`state['step'] += 1`
			`beta1, beta2 = group['betas']`

[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`if target_device.type == 'cpu':`
[zero] improve adaptability for not-shard parameters (#708) * adapt post grad hooks for not-shard parameters * adapt optimizer for not-shard parameters * offload gradients for not-replicated parameters 2022-04-11 05:38:51 +00:00			`assert p.data.numel() == p.grad.data.numel(), "parameter and gradient should have the same size"`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`assert state['exp_avg'].device.type == 'cpu', "exp_avg should stay on cpu"`
			`assert state['exp_avg_sq'].device.type == 'cpu', "exp_avg should stay on cpu"`
			`self.cpu_adam_op.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'],`
			`group['weight_decay'], group['bias_correction'], p.data, p.grad.data,`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`state['exp_avg'], state['exp_avg_sq'], -1)`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`elif target_device.type == 'cuda':`
			`assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"`
			`assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda"`

[zero] improve adaptability for not-shard parameters (#708) * adapt post grad hooks for not-shard parameters * adapt optimizer for not-shard parameters * offload gradients for not-replicated parameters 2022-04-11 05:38:51 +00:00			`bias_correction1 = 1 - beta1**state['step']`
			`bias_correction2 = 1 - beta2**state['step']`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00
			`# adam on cuda`
			`self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'],`
			`beta1, beta2, group['eps'], group['weight_decay'], bias_correction1,`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`bias_correction2, self.adamw_mode)`
[zero] hybrid cpu adam (#445) 2022-03-17 07:05:41 +00:00			`else:`
			`raise RuntimeError`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`return loss`