From 6a3f9fda83ed383f43e3bb0c36158309ff3faa3d Mon Sep 17 00:00:00 2001
From: LuGY <74758262+Gy-Lu@users.noreply.github.com>
Date: Fri, 25 Mar 2022 14:15:53 +0800
Subject: [PATCH] [cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497)

---
 .../cuda_native/csrc/multi_tensor_adam.cu     |  18 +--
 .../kernel/cuda_native/csrc/type_shim.h       |  30 ++++
 colossalai/nn/optimizer/fused_adam.py         |  41 ++---
 tests/test_optimizer/unittest_cpu_adam.py     | 148 +++++-------------
 tests/test_optimizer/unittest_fused_adam.py   |  61 ++++++++
 .../unittest_fused_adam_kernel.py             |  98 ++++++++++++
 6 files changed, 253 insertions(+), 143 deletions(-)
 create mode 100644 tests/test_optimizer/unittest_fused_adam.py
 create mode 100644 tests/test_optimizer/unittest_fused_adam_kernel.py

diff --git a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
index 633e2d63f..eb854ce93 100644
--- a/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
+++ b/colossalai/kernel/cuda_native/csrc/multi_tensor_adam.cu
@@ -22,7 +22,7 @@ typedef enum
 
 using MATH_T = float;
 
-template <typename T>
+template <typename T_g, typename T_p>
 struct AdamFunctor
 {
   __device__ __forceinline__ void operator()(
@@ -50,16 +50,16 @@ struct AdamFunctor
     int chunk_idx = tl.block_to_chunk[blockIdx.x];
     int n = tl.sizes[tensor_loc];
 
-    T *g = (T *)tl.addresses[0][tensor_loc];
+    T_g *g = (T_g *)tl.addresses[0][tensor_loc];
     g += chunk_idx * chunk_size;
 
-    T *p = (T *)tl.addresses[1][tensor_loc];
+    T_p *p = (T_p *)tl.addresses[1][tensor_loc];
     p += chunk_idx * chunk_size;
 
-    T *m = (T *)tl.addresses[2][tensor_loc];
+    T_p *m = (T_p *)tl.addresses[2][tensor_loc];
     m += chunk_idx * chunk_size;
 
-    T *v = (T *)tl.addresses[3][tensor_loc];
+    T_p *v = (T_p *)tl.addresses[3][tensor_loc];
     v += chunk_idx * chunk_size;
 
     n -= chunk_idx * chunk_size;
@@ -155,15 +155,15 @@ void multi_tensor_adam_cuda(
     bias_correction2 = 1 - std::pow(beta2, step);
   }
 
-  // Assume single type across p,g,m1,m2 now
-  DISPATCH_DOUBLE_FLOAT_AND_HALF(
-      tensor_lists[0][0].scalar_type(), 0, "adam",
+  DISPATCH_FLOAT_AND_HALF_FOR_G_P(
+      tensor_lists[0][0].scalar_type(),
+      tensor_lists[1][0].scalar_type(), 0, "adam",
       multi_tensor_apply<4>(
           BLOCK_SIZE,
           chunk_size,
           noop_flag,
           tensor_lists,
-          AdamFunctor<scalar_t_0>(),
+          AdamFunctor<g_scalar_t_0, p_scalar_t_0>(),
          beta1,
           beta2,
           bias_correction1,
diff --git a/colossalai/kernel/cuda_native/csrc/type_shim.h b/colossalai/kernel/cuda_native/csrc/type_shim.h
index 2cae99a2d..cf83414af 100644
--- a/colossalai/kernel/cuda_native/csrc/type_shim.h
+++ b/colossalai/kernel/cuda_native/csrc/type_shim.h
@@ -173,6 +173,36 @@
     AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
   }
 
+#define DISPATCH_FLOAT_AND_HALF_FOR_G_P(GTYPE, PTYPE, LEVEL, NAME, ...)
\ + if (GTYPE == at::ScalarType::Float && PTYPE == at::ScalarType::Float) \ + { \ + using g_scalar_t_##LEVEL = float; \ + using p_scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + } \ + else if (GTYPE == at::ScalarType::Float && PTYPE == at::ScalarType::Half) \ + { \ + using g_scalar_t_##LEVEL = float; \ + using p_scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + } \ + else if (GTYPE == at::ScalarType::Half && PTYPE == at::ScalarType::Float) \ + { \ + using g_scalar_t_##LEVEL = at::Half; \ + using p_scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + } \ + else if (GTYPE == at::ScalarType::Half && PTYPE == at::ScalarType::Half) \ + { \ + using g_scalar_t_##LEVEL = at::Half; \ + using p_scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + } \ + else \ + { \ + AT_ERROR(#NAME, "not implemented for '", toString(GTYPE), toString(PTYPE), "'"); \ + } \ + template __device__ __forceinline__ T reduce_block_into_lanes(T *x, T val, diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py index e5c17c3e1..465e000a1 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -10,7 +10,7 @@ class FusedAdam(torch.optim.Optimizer): """Implements Adam algorithm. Currently GPU-only. Requires ColossalAI to be installed via - ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``. + ``pip install .``. This version of fused Adam implements 2 fusions. @@ -18,7 +18,7 @@ class FusedAdam(torch.optim.Optimizer): * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches. :class:`colossalai.nn.optimizer.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``, - or ``torch.optim.Adam`` with ``adam_w_mode=False`` + or ``torch.optim.Adam`` with ``adamw_mode=False`` :class:`colossalai.nn.optimizer.FusedAdam` may be used with or without Amp. @@ -36,7 +36,7 @@ class FusedAdam(torch.optim.Optimizer): amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) NOT SUPPORTED in FusedAdam! - adam_w_mode (boolean, optional): Apply L2 regularization or weight decay + adamw_mode (boolean, optional): Apply L2 regularization or weight decay True for decoupled weight decay(also known as AdamW) (default: True) set_grad_none (bool, optional): whether set grad to None when zero_grad() method is called. 
(default: True) @@ -53,7 +53,7 @@ class FusedAdam(torch.optim.Optimizer): bias_correction=True, betas=(0.9, 0.999), eps=1e-8, - adam_w_mode=True, + adamw_mode=True, weight_decay=0., amsgrad=False, set_grad_none=True): @@ -62,7 +62,7 @@ class FusedAdam(torch.optim.Optimizer): raise RuntimeError('FusedAdam does not support the AMSGrad variant.') defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay) super(FusedAdam, self).__init__(params, defaults) - self.adam_w_mode = 1 if adam_w_mode else 0 + self.adamw_mode = 1 if adamw_mode else 0 self.set_grad_none = set_grad_none if multi_tensor_applier.available: import colossal_C @@ -109,8 +109,7 @@ class FusedAdam(torch.optim.Optimizer): group['step'] = 1 # create lists for multi-tensor apply - g_16, p_16, m_16, v_16 = [], [], [], [] - g_32, p_32, m_32, v_32 = [], [], [], [] + g_l, p_l, m_l, v_l = [], [], [], [] for p in group['params']: if p.grad is None: @@ -127,26 +126,16 @@ class FusedAdam(torch.optim.Optimizer): # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) - if p.dtype == torch.float16: - g_16.append(p.grad.data) - p_16.append(p.data) - m_16.append(state['exp_avg']) - v_16.append(state['exp_avg_sq']) - elif p.dtype == torch.float32: - g_32.append(p.grad.data) - p_32.append(p.data) - m_32.append(state['exp_avg']) - v_32.append(state['exp_avg_sq']) - else: + if p.dtype not in [torch.float16, torch.float32]: raise RuntimeError('FusedAdam only support fp16 and fp32.') + + g_l.append(p.grad.data) + p_l.append(p.data) + m_l.append(state['exp_avg']) + v_l.append(state['exp_avg_sq']) - if (len(g_16) > 0): - multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_16, p_16, m_16, v_16], - group['lr'], beta1, beta2, group['eps'], group['step'], self.adam_w_mode, - bias_correction, group['weight_decay']) - if (len(g_32) > 0): - multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32], - group['lr'], beta1, beta2, group['eps'], group['step'], self.adam_w_mode, - bias_correction, group['weight_decay']) + multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_l, p_l, m_l, v_l], + group['lr'], beta1, beta2, group['eps'], group['step'], self.adamw_mode, + bias_correction, group['weight_decay']) return loss diff --git a/tests/test_optimizer/unittest_cpu_adam.py b/tests/test_optimizer/unittest_cpu_adam.py index 401fc5241..55ba74f7e 100644 --- a/tests/test_optimizer/unittest_cpu_adam.py +++ b/tests/test_optimizer/unittest_cpu_adam.py @@ -1,38 +1,7 @@ -# BSD 3-Clause License -# -# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the psutil authors nor the names of its contributors -# may be used to endorse or promote products derived from this software without -# specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import math import torch -try: - import cpu_adam -except ImportError: - raise ImportError("import cpu_adam error") + +from colossalai.testing import parameterize def torch_adam_update( @@ -71,45 +40,46 @@ def torch_adam_update( param.addcdiv_(exp_avg, denom, value=-step_size) -class Test(): +def assertLess(data_diff, threshold, msg): + assert data_diff < threshold, msg - def __init__(self): - self.opt_id = 0 - def assertLess(self, data_diff, threshold, msg): - assert data_diff < threshold, msg +def assertTrue(condition, msg): + assert condition, msg - def assertTrue(self, condition, msg): - assert condition, msg - def check_res( - self, - step, - lr, - eps, - beta1, - beta2, - weight_decay, - shape, - grad_dtype, - loss_scale, - use_adamw, - cpu_adam_op, - ): - p_data = torch.rand(shape, dtype=grad_dtype) +@parameterize('adamw', [True, False]) +@parameterize('step', [1, 2]) +@parameterize('loss_scale', [-1, 2 ** 5]) +@parameterize('p_dtype', [torch.float, torch.half]) +@parameterize('g_dtype', [torch.float, torch.half]) +def test_cpu_adam(adamw, step, loss_scale, p_dtype, g_dtype): + lr = 1e-3 + beta1, beta2 = 0.9, 0.999 + eps = 1e-8 + weight_decay = 0 + + for i in range(1024): + p_data = torch.rand(64, dtype=p_dtype) p_data_copy = p_data.clone().float() - p_grad = torch.rand(shape, dtype=grad_dtype) + p_grad = torch.rand(64, dtype=g_dtype) if loss_scale > 0: p_grad.mul_(loss_scale) p_grad_copy = p_grad.clone().float() - exp_avg = torch.rand(shape) + exp_avg = torch.rand(p_data.shape) exp_avg_copy = exp_avg.clone() - exp_avg_sq = torch.rand(shape) + exp_avg_sq = torch.rand(p_data.shape) exp_avg_sq_copy = exp_avg_sq.clone() - cpu_adam_op.create_adam(0, lr, beta1, beta2, eps, weight_decay, use_adamw, True) + try: + import cpu_adam + cpu_adam_op = cpu_adam + except: + raise ImportError("...") + + cpu_adam_op.create_adam(0, lr, beta1, beta2, eps, weight_decay, adamw, False) cpu_adam_op.adam_update( - self.opt_id, + 0, step, lr, beta1, @@ -136,62 +106,24 @@ class Test(): exp_avg_copy, exp_avg_sq_copy, loss_scale, - use_adamw, + adamw, ) - if loss_scale > 0: p_grad.div_(loss_scale) - var = p_data_copy - p_data data_diff = torch.max(torch.abs(var)) - threshold = 2e-3 if grad_dtype else 1e-4 - self.assertLess( + threshold = 1e-3 + print(f"p_data diff {data_diff}. failed check, step {step}, lr {lr} eps " + f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}") + assertLess( data_diff, threshold, - f"p_data diff {data_diff}. failed check, step {step}, lr {lr} eps " - f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} loss_scale {loss_scale} grad_dtype {grad_dtype}", + f"p_data diff {data_diff}. 
failed check, step {step}, lr {lr}, loss_scale {loss_scale}, eps " + f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}", ) max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad)) - self.assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}") + assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}") max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg)) - self.assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}") + assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}") max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq)) - self.assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}") - - def test_cpu_adam(self): - lr = 0.9 - eps = 1e-6 - weight_decay = 0 - for use_adamw in [False, True]: - for shape in [(23,), (8, 24)]: - for step in range(1, 2): - for lr in [0.01]: - for eps in [1e-8]: - for beta1 in [0.9]: - for beta2 in [0.999]: - for weight_decay in [0.001]: - for grad_dtype in [torch.half, torch.float]: - for loss_scale in [-1, 2**5]: - self.check_res( - step, - lr, - eps, - beta1, - beta2, - weight_decay, - shape, - grad_dtype, - loss_scale, - use_adamw, - cpu_adam, - ) - - -def test_cpu_adam(): - test_case = Test() - test_case.test_cpu_adam() - - -if __name__ == "__main__": - test = Test() - test.test_cpu_adam() + assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}") diff --git a/tests/test_optimizer/unittest_fused_adam.py b/tests/test_optimizer/unittest_fused_adam.py new file mode 100644 index 000000000..f3b2b2a7b --- /dev/null +++ b/tests/test_optimizer/unittest_fused_adam.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +from torch.optim.adam import Adam +from torch.optim import AdamW + +from colossalai.nn.optimizer.fused_adam import FusedAdam +from colossalai.testing import parameterize + + +class FC(nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc = nn.Sequential(nn.Linear(64, 64)) + def forward(self, x): + return self.fc(x) + + +@parameterize('adamw', [False, True]) +@parameterize('p_dtype', [torch.float, torch.half]) +@parameterize('g_dtype', [torch.float, torch.half]) +def test_adam(adamw, p_dtype, g_dtype): + model = FC().cuda().to(p_dtype) + state = model.state_dict() + model_copy = FC().cuda().to(p_dtype) + model_copy.load_state_dict(state.copy()) + + if adamw: + optim = FusedAdam(model.parameters(), lr=1e-3, adamw_mode=True) + torch_optim = AdamW(model_copy.parameters(), lr=1e-3) + else: + optim = FusedAdam(model.parameters(), lr=1e-3) + torch_optim = Adam(model_copy.parameters(), lr=1e-3) + + data = torch.rand(1024, 64).cuda().to(p_dtype) + data_copy = data.clone() + label = torch.rand(1024, 64).cuda().to(p_dtype) + + for d, l in zip(data, label): + y = model(d) + loss = ((l - y) ** 2).sum() + optim.zero_grad() + loss.backward() + if p_dtype != g_dtype: + for i in range(len(optim.param_groups[0]['params'])): + optim.param_groups[0]['params'][i].grad.data = optim.param_groups[0]['params'][i].grad.data.to(g_dtype) + optim.step() + + for d, l in zip(data_copy, label): + y = model_copy(d) + loss = ((l - y) ** 2).sum() + torch_optim.zero_grad() + loss.backward() + torch_optim.step() + + assert len(optim.param_groups[0]['params']) == len(torch_optim.param_groups[0]['params']) + + for i in range(len(optim.param_groups[0]['params'])): + if torch.isnan(optim.param_groups[0]['params'][i]).any() \ + or 
torch.isnan(torch_optim.param_groups[0]['params'][i]).any(): + continue + assert torch.allclose(optim.param_groups[0]['params'][i], torch_optim.param_groups[0]['params'][i], 2e-3, 2e-3) diff --git a/tests/test_optimizer/unittest_fused_adam_kernel.py b/tests/test_optimizer/unittest_fused_adam_kernel.py new file mode 100644 index 000000000..7de256e78 --- /dev/null +++ b/tests/test_optimizer/unittest_fused_adam_kernel.py @@ -0,0 +1,98 @@ +from numpy import dtype +import torch +import torch.nn as nn + +import math + +from colossalai.testing import parameterize +from colossalai.utils import multi_tensor_applier + +def torch_adam_update( + step, + lr, + beta1, + beta2, + eps, + weight_decay, + param, + grad, + exp_avg, + exp_avg_sq, + loss_scale, + use_adamw, +): + if loss_scale > 0: + grad.div_(loss_scale) + bias_correction1 = 1 - beta1**step + bias_correction2 = 1 - beta2**step + + if weight_decay != 0: + if use_adamw: + # Perform stepweight decay + param.mul_(1 - lr * weight_decay) + else: + grad = grad.add(param, alpha=weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) + + step_size = lr / bias_correction1 + + param.addcdiv_(exp_avg, denom, value=-step_size) + + +@parameterize('adamw', [False, True]) +@parameterize('step', [1, 2]) +@parameterize('p_dtype', [torch.float, torch.half]) +@parameterize('g_dtype', [torch.float, torch.half]) +def test_adam(adamw, step, p_dtype, g_dtype): + try: + import colossal_C + fused_adam = colossal_C.multi_tensor_adam + dummy_overflow_buf = torch.cuda.IntTensor([0]) + except: + raise ImportError("No colossal_C kernel installed.") + + count = 0 + + for i in range(1024): + p = torch.rand(64, dtype=p_dtype).cuda() + p_copy = p.clone().float() + g = torch.rand(p.shape, dtype=g_dtype).cuda() + g_copy = g.clone().float() + m = torch.rand(p.shape).cuda() + m_copy = m.clone() + v = torch.rand(p.shape).cuda() + v_copy = v.clone() + + lr = 1e-3 + beta1, beta2 = 0.9, 0.999 + eps = 1e-8 + weight_decay = 0 + + multi_tensor_applier(fused_adam, dummy_overflow_buf, [[g], [p], [m], [v]], + lr, beta1, beta2, eps, step, adamw, + True, weight_decay) + + torch_adam_update( + step, + lr, + beta1, + beta2, + eps, + weight_decay, + p_copy, # fp32 data + g_copy, # fp32 grad + m_copy, + v_copy, + -1, + adamw, + ) + + if torch.isnan(p).any() or torch.isnan(p_copy).any(): + count += 1 + continue + assert count < 200, "too many nans" + assert torch.allclose(p.to(torch.float), p_copy.to(torch.float), 1e-5, 1e-5), f"failed check, adamw {adamw}, p_dtype {p_dtype}, g_dtype {g_dtype}"
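A minimal usage sketch (not part of the commit above), showing one way the hybrid path added by this patch could be exercised: fp16 parameters updated from fp32 gradients, mirroring the dtype-casting pattern used in unittest_fused_adam.py. It assumes ColossalAI is installed with the colossal_C CUDA extension built and a CUDA device is available; the model, tensor shapes, and loss are illustrative only.

# Sketch only: assumes colossal_C is built and a GPU is present.
import torch
import torch.nn as nn

from colossalai.nn.optimizer.fused_adam import FusedAdam

model = nn.Linear(64, 64).cuda().half()           # fp16 parameters
optim = FusedAdam(model.parameters(), lr=1e-3, adamw_mode=True)

data = torch.rand(16, 64, device='cuda', dtype=torch.half)
loss = model(data).float().pow(2).sum()

optim.zero_grad()
loss.backward()

# Cast gradients to fp32 while the parameters stay fp16. After this commit the
# kernel dispatches on (grad dtype, param dtype) via DISPATCH_FLOAT_AND_HALF_FOR_G_P,
# so g may be fp32 while p, m and v follow the parameter dtype.
for p in model.parameters():
    if p.grad is not None:
        p.grad.data = p.grad.data.float()

optim.step()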