import torch
import torch.nn as nn
from torch.optim import Adam, AdamW

from colossalai.nn.optimizer.fused_adam import FusedAdam
from colossalai.testing import clear_cache_before_run, parameterize


class FC(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        # A single 64 -> 64 linear layer is enough to exercise the optimizer update path.
        self.fc = nn.Sequential(nn.Linear(64, 64))

    def forward(self, x):
        return self.fc(x)


@clear_cache_before_run()
@parameterize('adamw', [False, True])
@parameterize('p_dtype', [torch.float, torch.half])
@parameterize('g_dtype', [torch.float, torch.half])
def test_adam(adamw, p_dtype, g_dtype):
    """Check that FusedAdam matches torch.optim.Adam/AdamW on a small model."""
    model = FC().cuda().to(p_dtype)
    state = model.state_dict()
    model_copy = FC().cuda().to(p_dtype)
    model_copy.load_state_dict(state.copy())

    if adamw:
        optim = FusedAdam(model.parameters(), lr=1e-3, adamw_mode=True)
        torch_optim = AdamW(model_copy.parameters(), lr=1e-3)
    else:
        optim = FusedAdam(model.parameters(), lr=1e-3)
        torch_optim = Adam(model_copy.parameters(), lr=1e-3)

    data = torch.rand(1024, 64).cuda().to(p_dtype)
    data_copy = data.clone()
    label = torch.rand(1024, 64).cuda().to(p_dtype)

    # Train the FusedAdam model one sample at a time.
    for d, l in zip(data, label):
        y = model(d)
        loss = ((l - y)**2).sum()
        optim.zero_grad()
        loss.backward()
        # When the gradient dtype differs from the parameter dtype,
        # cast the gradients so FusedAdam sees g_dtype gradients.
        if p_dtype != g_dtype:
            for i in range(len(optim.param_groups[0]['params'])):
                optim.param_groups[0]['params'][i].grad.data = \
                    optim.param_groups[0]['params'][i].grad.data.to(g_dtype)
        optim.step()

    # Train the reference model with the corresponding torch optimizer.
    for d, l in zip(data_copy, label):
        y = model_copy(d)
        loss = ((l - y)**2).sum()
        torch_optim.zero_grad()
        loss.backward()
        torch_optim.step()

    assert len(optim.param_groups[0]['params']) == len(torch_optim.param_groups[0]['params'])

    # Compare updated parameters, skipping any that overflowed to NaN in half precision.
    for i in range(len(optim.param_groups[0]['params'])):
        if torch.isnan(optim.param_groups[0]['params'][i]).any() \
                or torch.isnan(torch_optim.param_groups[0]['params'][i]).any():
            continue
        assert torch.allclose(optim.param_groups[0]['params'][i],
                              torch_optim.param_groups[0]['params'][i],
                              rtol=2e-3,
                              atol=2e-3)
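

# A minimal direct entry point, assuming colossalai.testing's @parameterize
# decorator sweeps the listed argument combinations when the wrapped function
# is called with no arguments. If that assumption does not hold in your
# version, run this file through pytest instead.
if __name__ == '__main__':
    test_adam()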