@@ -12,9 +12,10 @@ from torch._six import inf
 from torch.nn.parameter import Parameter
 
 try:
-    import colossalai._C.fused_optim
+    from colossalai._C import fused_optim
 except:
-    pass
+    from colossalai.kernel.op_builder import FusedOptimBuilder
+    fused_optim = FusedOptimBuilder().load()
 
 from collections import defaultdict
 from contextlib import contextmanager
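Note: the hunk above swaps the side-effect import `import colossalai._C.fused_optim` (which only binds the top-level name `colossalai`) for `from colossalai._C import fused_optim`, which binds the bare name `fused_optim`, and adds a JIT-build fallback when the prebuilt extension is absent. That is why every call site in the hunks below drops the `colossalai._C.` prefix. A minimal sketch of the resulting load logic, assuming `FusedOptimBuilder().load()` compiles the extension on demand and returns the module (the builder internals are not part of this diff):

    # Sketch only: names are taken from the hunk above. `ImportError` is
    # used here for clarity; the code in the diff keeps a bare `except:`.
    def _load_fused_optim():
        try:
            from colossalai._C import fused_optim       # prebuilt at install time
        except ImportError:
            from colossalai.kernel.op_builder import FusedOptimBuilder
            fused_optim = FusedOptimBuilder().load()    # JIT build on first import
        return fused_optim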
@@ -133,7 +134,7 @@ def _calc_l2_norm(grads):
     if len(grads) > 0:
         dummy_overflow_buf = torch.cuda.IntTensor([0])
         norm, _ = multi_tensor_applier(
-            colossalai._C.fused_optim.multi_tensor_l2norm,
+            fused_optim.multi_tensor_l2norm,
             dummy_overflow_buf,
             [grads],
             False    # no per-parameter norm
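The call in `_calc_l2_norm` follows the apex-style `multi_tensor_applier` convention: one fused kernel launch over the whole gradient list, returning the global L2 norm plus optional per-tensor norms (disabled here by the trailing `False`). A hedged sketch of the pattern, assuming `multi_tensor_applier` is re-exported from `colossalai.utils` (that import path is an assumption, not shown in this diff):

    import torch
    from colossalai.utils import multi_tensor_applier    # assumed export path

    try:
        from colossalai._C import fused_optim            # as in the first hunk
    except ImportError:
        from colossalai.kernel.op_builder import FusedOptimBuilder
        fused_optim = FusedOptimBuilder().load()

    def grads_l2_norm(grads):
        """Global L2 norm of a list of CUDA gradients in one fused launch."""
        if len(grads) == 0:
            return torch.zeros(1, device="cuda")
        dummy_overflow_buf = torch.cuda.IntTensor([0])   # scratch buffer the kernel expects
        norm, _ = multi_tensor_applier(
            fused_optim.multi_tensor_l2norm,
            dummy_overflow_buf,
            [grads],
            False,    # skip per-parameter norms; only the total is needed
        )
        return norm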
@@ -270,8 +271,8 @@ def _clip_grad_norm(parameters, max_norm: float, total_norm: float) -> None:
                 cpu_grads.append(p.grad.detach())
         if len(cuda_grads) > 0:
             dummy_overflow_buf = torch.cuda.IntTensor([0])
-            multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, dummy_overflow_buf,
-                                 [cuda_grads, cuda_grads], clip_coef)
+            multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [cuda_grads, cuda_grads],
+                                 clip_coef)
         for g in cpu_grads:
             g.mul_(clip_coef)
@@ -397,8 +398,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
         if enable_cuda_kernels:
             grads = [p.grad.detach() for p in params]
             dummy_overflow_buf = torch.cuda.IntTensor([0])
-            multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads],
-                                 clip_coeff)
+            multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff)
         else:
             for p in params:
                 p.grad.detach().mul_(clip_coeff)
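The last two hunks are the same mechanical rename at the `multi_tensor_scale` call sites (plus re-wrapping the long lines). For reference, a sketch of the clipping step they implement, assuming the apex-style signature `multi_tensor_applier(kernel, overflow_buf, tensor_lists, scalar)`, in which passing `[grads, grads]` scales the tensors in place:

    import torch
    from colossalai.utils import multi_tensor_applier    # assumed export path
    from colossalai.kernel.op_builder import FusedOptimBuilder

    # JIT-build fallback; the prebuilt `colossalai._C.fused_optim` works too.
    fused_optim = FusedOptimBuilder().load()

    def scale_grads_(grads, clip_coef: float) -> None:
        """Multiply every gradient by `clip_coef` with a single fused launch."""
        if len(grads) == 0:
            return
        dummy_overflow_buf = torch.cuda.IntTensor([0])
        # [grads, grads] = read from `grads`, write back into `grads`,
        # i.e. an in-place scale across the whole list in one kernel.
        multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf,
                             [grads, grads], clip_coef)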