ColossalAI/colossalai/amp/naive_amp/_utils.py

from typing import List
from torch import Tensor


def has_inf_or_nan(tensor):
    try:
        # if tensor is half, the .float() incurs an additional deep copy, but it's necessary if
        # Pytorch's .sum() creates a one-element tensor of the same type as tensor
        # (which is true for some recent version of pytorch).
        tensor_sum = float(tensor.float().sum())
        # More efficient version that can be used if .sum() returns a Python scalar
        # tensor_sum = float(tensor.sum())
    except RuntimeError as instance:
        # We want to check if inst is actually an overflow exception.
        # RuntimeError could come from a different error.
        # If so, we still want the exception to propagate.
        if "value cannot be converted" not in instance.args[0]:
            raise
        return True
    else:
        if tensor_sum == float('inf') or tensor_sum == -float('inf') or tensor_sum != tensor_sum:
            return True
        return False


def zero_gard_by_list(tensor_list: List[Tensor], set_to_none: bool = True) -> None:
    """
    Clear the gradient of a list of tensors,
    Note: copied from torch.optim.optimizer.
    """
    for param in tensor_list:
        if param.grad is not None:
            if set_to_none:
                param.grad = None
            else:
                if param.grad.grad_fn is not None:
                    param.grad.detach_()
                else:
                    param.grad.requires_grad_(False)
                param.grad.zero_()