# ColossalAI/colossalai/nn/optimizer/hybrid_adam.py

from typing import Any, Optional

import torch

from colossalai.kernel.op_builder import FusedOptimBuilder
from colossalai.utils import multi_tensor_applier

from .cpu_adam import CPUAdam


class HybridAdam(CPUAdam):
    r"""Implements the Adam algorithm for parameters living on CPU and/or GPU.

    Each parameter is updated on the device it lives on. The parameter and its
    gradient should be on the same device:

      * Parameters on CPU and gradients on CPU is allowed.
      * Parameters on GPU and gradients on GPU is allowed.
      * Parameters on GPU and gradients on CPU is **not** allowed.

    `HybridAdam` requires CUDA extensions which can be built during installation or runtime.

    This optimizer is a hybrid of CPUAdam and FusedAdam:

    * Parameters on CPU are updated with the CPUAdam kernel.
    * Parameters on GPU are updated with the FusedAdam (multi-tensor) kernel.
    * Hybrid precision calculation of fp16 and fp32 is supported, e.g. fp32 parameters and fp16 gradients.

    :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
    or ``torch.optim.Adam`` with ``adamw_mode=False``

    Adam was proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        model_params (iterable): iterable of parameters or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (boolean, optional): forwarded to ``CPUAdam`` and read
            back from each parameter group in :meth:`step`. (default: True)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        adamw_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        nvme_offload_fraction (float, optional): Fraction of optimizer states to be offloaded to NVMe. Defaults to 0.0.
        nvme_offload_dir (Optional[str], optional): Directory to save NVMe offload files.
            If it's ``None``, a random temporary directory will be used. Defaults to None.
        **defaults (Any): extra keyword arguments are accepted but not forwarded
            to ``CPUAdam`` by this constructor. In particular ``amsgrad`` (the
            variant from `On the Convergence of Adam and Beyond`_) and
            ``simd_log`` are not parameters of this class.

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    # Number of fp32 shards for per parameter
    # Param weight, grad, momentum and variance
    num_fp32_shards_per_param = 4

    def __init__(
        self,
        model_params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        adamw_mode=True,
        nvme_offload_fraction: float = 0.0,
        nvme_offload_dir: Optional[str] = None,
        **defaults: Any,
    ):
        # NOTE: ``defaults`` is intentionally not forwarded; only the named
        # hyper-parameters reach CPUAdam.
        super().__init__(
            model_params,
            lr,
            bias_correction,
            betas,
            eps,
            weight_decay,
            adamw_mode,
            nvme_offload_fraction,
            nvme_offload_dir,
        )
        # Load (or build) the fused CUDA optimizer extension and keep a handle
        # on its multi-tensor Adam kernel for the GPU code path.
        fused_optim = FusedOptimBuilder().load()
        self.gpu_adam_op = fused_optim.multi_tensor_adam
        # Buffer handed to ``multi_tensor_applier`` on every GPU update.
        self._dummy_overflow_buf = torch.cuda.IntTensor([0])

    @torch.no_grad()
    def step(self, closure=None, div_scale: float = -1):
        """Perform a single optimization step over all parameter groups.

        CPU parameters are updated one by one as they are visited; CUDA
        parameters are collected per group and updated with a single fused
        kernel call.

        Args:
            closure (callable, optional): re-evaluates the model and returns
                the loss. It is called with gradients enabled.
            div_scale (float, optional): forwarded unchanged to the CPU and
                fused GPU kernels (default: -1).
                NOTE(review): presumably a gradient un-scaling divisor where
                -1 disables it -- confirm against the kernels.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was given.

        Raises:
            RuntimeError: if a parameter lives on a device other than cpu or cuda.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # Inherited hook, called with the names of the two state tensors.
        # NOTE(review): presumably part of the NVMe offload machinery -- confirm.
        self._pre_step("exp_avg", "exp_avg_sq")
        for group in self.param_groups:
            # Gradients, params, first and second moments of the CUDA params
            # in this group, gathered for one fused update.
            g_l, p_l, m_l, v_l = [], [], [], []
            group_step = 0
            for p in group["params"]:
                if p.grad is None:
                    continue

                state = self.state[p]

                target_device = p.device
                if not state:
                    # Lazy state initialization on the parameter's own device.
                    state["step"] = 0
                    # gradient momentums
                    state["exp_avg"] = torch.zeros_like(p, device=target_device)
                    # gradient variances
                    state["exp_avg_sq"] = torch.zeros_like(p, device=target_device)
                    self._post_state_init(p)

                state["step"] += 1
                # The fused kernel receives one step count per group; the value
                # of the last parameter visited in the group is the one used.
                group_step = state["step"]
                beta1, beta2 = group["betas"]

                if target_device.type == "cpu":
                    assert state["exp_avg"].device.type == "cpu", "exp_avg should stay on cpu"
                    assert state["exp_avg_sq"].device.type == "cpu", "exp_avg_sq should stay on cpu"
                    self._pre_update(p, "exp_avg", "exp_avg_sq")
                    if p.grad.dtype is torch.bfloat16:
                        # cpu adam kernel does not support bf16 now
                        # NOTE(review): this fallback always computes the bias
                        # corrections and is not passed ``div_scale`` or
                        # ``group["bias_correction"]``.
                        bias_correction1 = 1 - beta1 ** state["step"]
                        bias_correction2 = 1 - beta2 ** state["step"]
                        self.torch_adam_update(
                            p.data,
                            p.grad.data,
                            state["exp_avg"],
                            state["exp_avg_sq"],
                            group["lr"],
                            beta1,
                            beta2,
                            group["eps"],
                            group["weight_decay"],
                            bias_correction1,
                            bias_correction2,
                            self.adamw_mode,
                        )
                    else:
                        self.cpu_adam_op.step(
                            state["step"],
                            group["lr"],
                            beta1,
                            beta2,
                            group["eps"],
                            group["weight_decay"],
                            group["bias_correction"],
                            p.data,
                            p.grad.data,
                            state["exp_avg"],
                            state["exp_avg_sq"],
                            div_scale,
                        )
                    self._post_update(p, "exp_avg", "exp_avg_sq")

                elif target_device.type == "cuda":
                    assert state["exp_avg"].device.type == "cuda", "exp_avg should stay on cuda"
                    assert state["exp_avg_sq"].device.type == "cuda", "exp_avg_sq should stay on cuda"

                    # record the state by group and update at once
                    g_l.append(p.grad.data)
                    p_l.append(p.data)
                    m_l.append(state["exp_avg"])
                    v_l.append(state["exp_avg_sq"])

                else:
                    raise RuntimeError(
                        f"HybridAdam only supports parameters on cpu or cuda, got device type '{target_device.type}'"
                    )
            if g_l:
                # The fused kernel is passed integer flags rather than booleans.
                adamw_mode = 1 if self.adamw_mode else 0
                bias_correction = 1 if group["bias_correction"] else 0
                multi_tensor_applier(
                    self.gpu_adam_op,
                    self._dummy_overflow_buf,
                    [g_l, p_l, m_l, v_l],
                    group["lr"],
                    group["betas"][0],
                    group["betas"][1],
                    group["eps"],
                    group_step,
                    adamw_mode,
                    bias_correction,
                    group["weight_decay"],
                    div_scale,
                )
        self._post_step()
        return loss
[Gemini] add GeminiAdamOptimizer (#1960) 2 years ago			`from typing import Any, Optional`

[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`import torch`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago
[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 1 year ago			`from colossalai.kernel.op_builder import FusedOptimBuilder`
[Gemini] add GeminiAdamOptimizer (#1960) 2 years ago			`from colossalai.utils import multi_tensor_applier`

[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 1 year ago			`from .cpu_adam import CPUAdam`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago
[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 1 year ago			`class HybridAdam(CPUAdam):`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago			`"""Implements Adam algorithm.`

[nfc]fix typo colossalai/pipeline tensor nn (#3899) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. * fix typo colossalai/auto_parallel autochunk fx/passes etc. * fix typo docs/ * change placememt_policy to placement_policy in docs/ and examples/ * fix typo colossalai/ applications/ * fix typo colossalai/cli fx kernel * fix typo colossalai/nn * revert change warmuped * fix typo colossalai/pipeline tensor nn 1 year ago			`Supports parameters updating on both GPU and CPU, depending on the device of parameters.`
[Gemini] add GeminiAdamOptimizer (#1960) 2 years ago			`But the parameters and gradients should on the same device:`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago			`* Parameters on CPU and gradients on CPU is allowed.`
			`* Parameters on GPU and gradients on GPU is allowed.`
			`* Parameters on GPU and gradients on CPU is not allowed.`
polish optimizer docstring (#619) 3 years ago
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`HybridAdam` requires CUDA extensions which can be built during installation or runtime.
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago
			`This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.`
polish optimizer docstring (#619) 3 years ago
			`* For parameters updating on CPU, it uses CPUAdam.`
			`* For parameters updating on GPU, it uses FusedAdam.`
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`* Hybrid precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago
			:class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
			or ``torch.optim.Adam`` with ``adamw_mode=False``

			Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.

			`Arguments:`
			`model_params (iterable): iterable of parameters of dicts defining`
			`parameter groups.`
			`lr (float, optional): learning rate. (default: 1e-3)`
			`betas (Tuple[float, float], optional): coefficients used for computing`
			`running averages of gradient and its square. (default: (0.9, 0.999))`
			`eps (float, optional): term added to the denominator to improve`
			`numerical stability. (default: 1e-8)`
			`weight_decay (float, optional): weight decay (L2 penalty) (default: 0)`
			`amsgrad (boolean, optional): whether to use the AMSGrad variant of this`
			algorithm from the paper `On the Convergence of Adam and Beyond`_
			`(default: False) NOT SUPPORTED yet in CPUAdam!`
			`adamw_mode (boolean, optional): Apply L2 regularization or weight decay`
			`True for decoupled weight decay(also known as AdamW) (default: True)`
[Gemini] add GeminiAdamOptimizer (#1960) 2 years ago			`simd_log (boolean, optional): whether to show if you are using SIMD to`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago			`accelerate. (default: False)`
fix nvme docstring (#1450) 2 years ago			`nvme_offload_fraction (float, optional): Fraction of optimizer states to be offloaded to NVMe. Defaults to 0.0.`
			`nvme_offload_dir (Optional[str], optional): Directory to save NVMe offload files.`
[nvme] CPUAdam and HybridAdam support NVMe offload (#1360) * impl nvme optimizer * update cpu adam * add unit test * update hybrid adam * update docstr * add TODOs * update CI * fix CI * fix CI * fix CI path * fix CI path * fix CI path * fix install tensornvme * fix CI * fix CI path * fix CI env variables * test CI * test CI * fix CI * fix nvme optim __del__ * fix adam __del__ * fix nvme optim * fix CI env variables * fix nvme optim import * test CI * test CI * fix CI 2 years ago			If it's ``None``, a random temporary directory will be used. Defaults to None.
polish optimizer docstring (#619) 3 years ago
			`.. _Adam\: A Method for Stochastic Optimization:`
[docs] updatad docs of hybrid adam and cpu adam (#552) 3 years ago			`https://arxiv.org/abs/1412.6980`
			`.. _On the Convergence of Adam and Beyond:`
			`https://openreview.net/forum?id=ryQu7f-RZ`
			`"""`

[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`# Number of fp32 shards for per parameter`
			`# Param weight, grad, momentum and variance`
			`num_fp32_shards_per_param = 4`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`def __init__(`
			`self,`
			`model_params,`
			`lr=1e-3,`
			`bias_correction=True,`
			`betas=(0.9, 0.999),`
			`eps=1e-8,`
			`weight_decay=0,`
			`adamw_mode=True,`
			`nvme_offload_fraction: float = 0.0,`
			`nvme_offload_dir: Optional[str] = None,`
			`**defaults: Any,`
			`):`
			`super().__init__(`
			`model_params,`
			`lr,`
			`bias_correction,`
			`betas,`
			`eps,`
			`weight_decay,`
			`adamw_mode,`
			`nvme_offload_fraction,`
			`nvme_offload_dir,`
			`)`
[setup] support pre-build and jit-build of cuda kernels (#2374) * [setup] support pre-build and jit-build of cuda kernels * polish code * polish code * polish code * polish code * polish code * polish code 2 years ago			`fused_optim = FusedOptimBuilder().load()`
[builder] runtime adam and fused_optim builder (#2184) 2 years ago			`self.gpu_adam_op = fused_optim.multi_tensor_adam`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`self._dummy_overflow_buf = torch.cuda.IntTensor([0])`

			`@torch.no_grad()`
[optimizer] add div_scale for optimizers (#2117) * [optimizer] add div_scale for optimizers * [zero] use div_scale in zero optimizer * fix testing error 2 years ago			`def step(self, closure=None, div_scale: float = -1):`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`loss = None`
			`if closure is not None:`
			`with torch.enable_grad():`
			`loss = closure()`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`self._pre_step("exp_avg", "exp_avg_sq")`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`for _, group in enumerate(self.param_groups):`
			`g_l, p_l, m_l, v_l = [], [], [], []`
			`group_step = 0`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`for _, p in enumerate(group["params"]):`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`if p.grad is None:`
			`continue`

			`state = self.state[p]`

			`target_device = p.device`
			`if len(state) == 0:`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`state["step"] = 0`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`# gradient momentums`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`state["exp_avg"] = torch.zeros_like(p, device=target_device)`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`# gradient variances`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`state["exp_avg_sq"] = torch.zeros_like(p, device=target_device)`
[nvme] CPUAdam and HybridAdam support NVMe offload (#1360) * impl nvme optimizer * update cpu adam * add unit test * update hybrid adam * update docstr * add TODOs * update CI * fix CI * fix CI * fix CI path * fix CI path * fix CI path * fix install tensornvme * fix CI * fix CI path * fix CI env variables * test CI * test CI * fix CI * fix nvme optim __del__ * fix adam __del__ * fix nvme optim * fix CI env variables * fix nvme optim import * test CI * test CI * fix CI 2 years ago			`self._post_state_init(p)`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`state["step"] += 1`
			`group_step = state["step"]`
			`beta1, beta2 = group["betas"]`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`if target_device.type == "cpu":`
			`assert state["exp_avg"].device.type == "cpu", "exp_avg should stay on cpu"`
			`assert state["exp_avg_sq"].device.type == "cpu", "exp_avg should stay on cpu"`
			`self._pre_update(p, "exp_avg", "exp_avg_sq")`
[kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921) * [kernel] support pure fp16 for cpu adam (#4896) * [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919) * [kernel] fix cpu adam * [test] update gemini optim test 1 year ago			`if p.grad.dtype is torch.bfloat16:`
[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 1 year ago			`# cpu adam kernel does not support bf16 now`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`bias_correction1 = 1 - beta1 ** state["step"]`
			`bias_correction2 = 1 - beta2 ** state["step"]`
			`self.torch_adam_update(`
			`p.data,`
			`p.grad.data,`
			`state["exp_avg"],`
			`state["exp_avg_sq"],`
			`group["lr"],`
			`beta1,`
			`beta2,`
			`group["eps"],`
			`group["weight_decay"],`
			`bias_correction1,`
			`bias_correction2,`
			`self.adamw_mode,`
			`)`
[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 1 year ago			`else:`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`self.cpu_adam_op.step(`
			`state["step"],`
			`group["lr"],`
			`beta1,`
			`beta2,`
			`group["eps"],`
			`group["weight_decay"],`
			`group["bias_correction"],`
			`p.data,`
			`p.grad.data,`
			`state["exp_avg"],`
			`state["exp_avg_sq"],`
			`div_scale,`
			`)`
			`self._post_update(p, "exp_avg", "exp_avg_sq")`

			`elif target_device.type == "cuda":`
			`assert state["exp_avg"].device.type == "cuda", "exp_avg should stay on cuda"`
			`assert state["exp_avg_sq"].device.type == "cuda", "exp_avg should stay on cuda"`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago
[NFC]fix typo colossalai/auto_parallel nn utils etc. (#3779) * fix typo colossalai/autochunk auto_parallel amp * fix typo colossalai/auto_parallel nn utils etc. 2 years ago			`# record the state by group and update at once`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`g_l.append(p.grad.data)`
			`p_l.append(p.data)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`m_l.append(state["exp_avg"])`
			`v_l.append(state["exp_avg_sq"])`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago
			`else:`
			`raise RuntimeError`
			`if len(g_l) > 0:`
			`adamw_mode = 1 if self.adamw_mode else 0`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`bias_correction = 1 if group["bias_correction"] else 0`
			`multi_tensor_applier(`
			`self.gpu_adam_op,`
			`self._dummy_overflow_buf,`
			`[g_l, p_l, m_l, v_l],`
			`group["lr"],`
			`group["betas"][0],`
			`group["betas"][1],`
			`group["eps"],`
			`group_step,`
			`adamw_mode,`
			`bias_correction,`
			`group["weight_decay"],`
			`div_scale,`
			`)`
[nvme] CPUAdam and HybridAdam support NVMe offload (#1360) * impl nvme optimizer * update cpu adam * add unit test * update hybrid adam * update docstr * add TODOs * update CI * fix CI * fix CI * fix CI path * fix CI path * fix CI path * fix install tensornvme * fix CI * fix CI path * fix CI env variables * test CI * test CI * fix CI * fix nvme optim __del__ * fix adam __del__ * fix nvme optim * fix CI env variables * fix nvme optim import * test CI * test CI * fix CI 2 years ago			`self._post_step()`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 3 years ago			`return loss`