# ColossalAI/tests/test_optimizer/test_cpu_adam.py

import math

import torch

from colossalai.testing import clear_cache_before_run, parameterize


def torch_adam_update(
    step,
    lr,
    beta1,
    beta2,
    eps,
    weight_decay,
    param,
    grad,
    exp_avg,
    exp_avg_sq,
    use_adamw,
):
    """Apply one Adam / AdamW update with plain PyTorch tensor ops.

    ``param``, ``exp_avg`` and ``exp_avg_sq`` are modified in place. ``grad``
    itself is never written to: when L2-style weight decay is applied, the
    decayed gradient is a new tensor bound to the local name only.

    Args:
        step: Step count used as the exponent in both bias corrections.
        lr: Learning rate.
        beta1: Decay coefficient of the first-moment running average.
        beta2: Decay coefficient of the second-moment running average.
        eps: Constant added to the denominator.
        weight_decay: Weight-decay factor; ``0`` skips weight decay entirely.
        param: Parameter tensor, updated in place.
        grad: Gradient tensor.
        exp_avg: First-moment running average, updated in place.
        exp_avg_sq: Second-moment running average, updated in place.
        use_adamw: When truthy (and ``weight_decay`` is non-zero) the
            parameter is scaled directly; otherwise the decay term is
            added to the gradient.

    Returns:
        None.
    """
    correction1 = 1 - beta1**step
    correction2 = 1 - beta2**step

    decay_enabled = weight_decay != 0
    if decay_enabled and use_adamw:
        # Decoupled decay: shrink the parameter before the Adam step.
        param.mul_(1 - lr * weight_decay)
    elif decay_enabled:
        # L2-style decay: out-of-place add, so the caller's grad stays intact.
        grad = grad.add(param, alpha=weight_decay)

    # First moment: exp_avg <- beta1 * exp_avg + (1 - beta1) * grad
    exp_avg.mul_(beta1)
    exp_avg.add_(grad, alpha=1 - beta1)

    # Second moment: exp_avg_sq <- beta2 * exp_avg_sq + (1 - beta2) * grad^2
    exp_avg_sq.mul_(beta2)
    exp_avg_sq.addcmul_(grad, grad, value=1 - beta2)

    # Bias-corrected root of the second moment, offset by eps.
    denom = exp_avg_sq.sqrt() / math.sqrt(correction2)
    denom.add_(eps)

    # The first-moment bias correction is folded into the step size.
    param.addcdiv_(exp_avg, denom, value=-(lr / correction1))


def assertLess(data_diff, threshold, msg):
    """Fail with ``msg`` unless ``data_diff`` is strictly below ``threshold``."""
    is_below = data_diff < threshold
    assert is_below, msg


def assertTrue(condition, msg):
    """Fail with ``msg`` unless ``condition`` is truthy."""
    passed = bool(condition)
    assert passed, msg


@clear_cache_before_run()
@parameterize('adamw', [True, False])
@parameterize('step', [1, 2])
@parameterize('p_dtype', [torch.float, torch.half])
@parameterize('g_dtype', [torch.float, torch.half])
def test_cpu_adam(adamw, step, p_dtype, g_dtype):
    """Compare one step of the CPU Adam kernel against ``torch_adam_update``.

    For each of three random trials, the kernel is run on tensors of dtype
    ``p_dtype`` / ``g_dtype`` while the reference runs on float32 clones of
    the same data. The parameter, gradient and both moment buffers must
    then agree within ``1e-3`` (max absolute difference).

    ``weight_decay`` is fixed at 0, so the weight-decay branches of
    ``torch_adam_update`` are not reached here; ``adamw`` only influences
    how the kernel optimizer object is constructed.

    Args:
        adamw: Flag forwarded to the kernel constructor and the reference.
        step: Step count forwarded to both implementations.
        p_dtype: dtype of the parameter tensor given to the kernel.
        g_dtype: dtype of the gradient tensor given to the kernel.
    """
    lr = 1e-3
    beta1, beta2 = 0.9, 0.999
    eps = 1e-8
    weight_decay = 0
    threshold = 1e-3

    # Function-scope import. Building/loading the extension does not depend
    # on the trial data, so it is done once here instead of once per trial.
    from colossalai.kernel.op_builder import CPUAdamBuilder
    cpu_optim = CPUAdamBuilder().load()

    for _ in range(3):
        p_data = torch.rand(64, dtype=p_dtype)
        p_data_copy = p_data.clone().float()
        p_grad = torch.rand(64, dtype=g_dtype)
        p_grad_copy = p_grad.clone().float()
        exp_avg = torch.rand(p_data.shape)
        exp_avg_copy = exp_avg.clone()
        exp_avg_sq = torch.rand(p_data.shape)
        exp_avg_sq_copy = exp_avg_sq.clone()

        # A new optimizer object per trial, so no state held by a previous
        # object can leak between trials.
        cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw)

        cpu_adam_op.step(
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            True,    # NOTE(review): presumably the bias-correction flag -- confirm against the kernel signature
            p_data.view(-1),    # parameter data in p_dtype (fp32 or fp16)
            p_grad.view(-1),    # gradient in g_dtype (fp32 or fp16)
            exp_avg.view(-1),
            exp_avg_sq.view(-1),
            -1,    # NOTE(review): presumably "no loss scale" -- confirm against the kernel signature
        )

        torch_adam_update(
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            p_data_copy,    # fp32 copy of the parameter data
            p_grad_copy,    # fp32 copy of the gradient
            exp_avg_copy,
            exp_avg_sq_copy,
            adamw,
        )

        data_diff = torch.max(torch.abs(p_data_copy - p_data))
        assertLess(
            data_diff,
            threshold,
            f"p_data diff {data_diff}. failed check, step {step}, lr {lr}, eps "
            f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}",
        )
        # The gradient handed to the kernel must still match its float32 copy.
        max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad))
        assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}")
        max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg))
        assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}")
        max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq))
        assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}")


if __name__ == '__main__':
    # Called with no arguments: presumably the `parameterize` decorators supply
    # adamw / step / p_dtype / g_dtype for every combination -- confirm in
    # colossalai.testing.
    test_cpu_adam()
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`import math`
[kernel] move all symlinks of kernel to `colossalai._C` (#1971) 2022-11-17 05:42:33 +00:00
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`import torch`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00
[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`from colossalai.testing import clear_cache_before_run, parameterize`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
[hotfix] run cpu adam unittest in pytest (#424) 2022-03-16 02:39:55 +00:00
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`def torch_adam_update(`
			`step,`
			`lr,`
			`beta1,`
			`beta2,`
			`eps,`
			`weight_decay,`
			`param,`
			`grad,`
			`exp_avg,`
			`exp_avg_sq,`
			`use_adamw,`
			`):`
[hotfix] run cpu adam unittest in pytest (#424) 2022-03-16 02:39:55 +00:00			`bias_correction1 = 1 - beta1**step`
			`bias_correction2 = 1 - beta2**step`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00
			`if weight_decay != 0:`
			`if use_adamw:`
			`# Perform stepweight decay`
			`param.mul_(1 - lr * weight_decay)`
			`else:`
			`grad = grad.add(param, alpha=weight_decay)`

			`# Decay the first and second moment running average coefficient`
			`exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)`
			`exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)`
			`denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)`

			`step_size = lr / bias_correction1`

			`param.addcdiv_(exp_avg, denom, value=-step_size)`


[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`def assertLess(data_diff, threshold, msg):`
			`assert data_diff < threshold, msg`


			`def assertTrue(condition, msg):`
			`assert condition, msg`


[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`@clear_cache_before_run()`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`@parameterize('adamw', [True, False])`
			`@parameterize('step', [1, 2])`
			`@parameterize('p_dtype', [torch.float, torch.half])`
			`@parameterize('g_dtype', [torch.float, torch.half])`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`def test_cpu_adam(adamw, step, p_dtype, g_dtype):`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`lr = 1e-3`
			`beta1, beta2 = 0.9, 0.999`
			`eps = 1e-8`
			`weight_decay = 0`
[hotfix] fix CPUAdam kernel nullptr (#1410) 2022-08-05 11:45:45 +00:00
[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`for i in range(3):`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`p_data = torch.rand(64, dtype=p_dtype)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`p_data_copy = p_data.clone().float()`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`p_grad = torch.rand(64, dtype=g_dtype)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`p_grad_copy = p_grad.clone().float()`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`exp_avg = torch.rand(p_data.shape)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`exp_avg_copy = exp_avg.clone()`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`exp_avg_sq = torch.rand(p_data.shape)`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`exp_avg_sq_copy = exp_avg_sq.clone()`

[setup] support pre-build and jit-build of cuda kernels (#2374) * [setup] support pre-build and jit-build of cuda kernels * polish code * polish code * polish code * polish code * polish code * polish code 2023-01-06 12:50:26 +00:00			`from colossalai.kernel.op_builder import CPUAdamBuilder`
			`cpu_optim = CPUAdamBuilder().load()`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00
[buider] use builder() for cpu adam and fused optim in setup.py (#2187) 2022-12-23 08:05:13 +00:00			`cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw)`

[hotfix] fix CPUAdam kernel nullptr (#1410) 2022-08-05 11:45:45 +00:00			`cpu_adam_op.step(`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`step,`
			`lr,`
			`beta1,`
			`beta2,`
			`eps,`
			`weight_decay,`
			`True,`
[hotfix] run cpu adam unittest in pytest (#424) 2022-03-16 02:39:55 +00:00			`p_data.view(-1), # fp32 data`
			`p_grad.view(-1), # fp32 grad`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`exp_avg.view(-1),`
			`exp_avg_sq.view(-1),`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`-1,`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`)`

			`torch_adam_update(`
			`step,`
			`lr,`
			`beta1,`
			`beta2,`
			`eps,`
			`weight_decay,`
[hotfix] run cpu adam unittest in pytest (#424) 2022-03-16 02:39:55 +00:00			`p_data_copy, # fp32 data`
			`p_grad_copy, # fp32 grad`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`exp_avg_copy,`
			`exp_avg_sq_copy,`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`adamw,`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`)`
			`var = p_data_copy - p_data`
			`data_diff = torch.max(torch.abs(var))`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`threshold = 1e-3`
			`assertLess(`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`data_diff,`
			`threshold,`
[zero]added hybrid adam, removed loss scale in adam (#527) * [zero]added hybrid adam, removed loss scale of adam * remove useless code 2022-03-25 10:03:54 +00:00			`f"p_data diff {data_diff}. failed check, step {step}, lr {lr}, eps "`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}",`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`)`
			`max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad))`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}")`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg))`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}")`
[zero] cpu adam kernel (#288) * Added CPU Adam * finished the cpu adam * updated the license * delete useless parameters, removed resnet * modified the method off cpu adam unittest * deleted some useless codes * removed useless codes Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: jiaruifang <fangjiarui123@gmail.com> 2022-03-04 08:05:15 +00:00			`max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq))`
[cuda] modify the fused adam, support hybrid of fp16 and fp32 (#497) 2022-03-25 06:15:53 +00:00			`assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}")`
[builder] runtime adam and fused_optim builder (#2184) 2022-12-23 06:14:21 +00:00

			`if __name__ == '__main__':`
			`test_cpu_adam()`