ColossalAI/colossalai/booster/mixed_precision/fp16_torch.py

from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer

from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.utils.device import autocast

from .mixed_precision_base import MixedPrecision

# Public API of this module. Every entry must name an object that is actually
# defined below; a stale name makes ``from <module> import *`` fail with
# AttributeError.
__all__ = ["FP16TorchMixedPrecision", "TorchAMPOptimizer", "TorchAMPModule"]


class TorchAMPOptimizer(OptimizerWrapper):
    """
    Drive a wrapped optimizer through a PyTorch AMP gradient scaler (FP16 training).

    The scaling arguments below are forwarded unchanged to
    :class:`torch.cuda.amp.GradScaler`, which is stored on ``self.scaler``.

    Args:
        optim (Optimizer): The optimizer being wrapped.
        init_scale (float): Starting value of the loss scale. Default: 2**16.
        growth_factor (float): ``growth_factor`` passed to the scaler; the multiplier
            it uses to enlarge the scale. Default: 2.0.
        backoff_factor (float): ``backoff_factor`` passed to the scaler; the multiplier
            it uses to shrink the scale. Default: 0.5.
        growth_interval (int): ``growth_interval`` passed to the scaler; controls how
            often the scale may be enlarged. Default: 2000.
    """

    def __init__(
        self,
        optim: Optimizer,
        init_scale: float = 2.0**16,
        growth_factor: float = 2.0,
        backoff_factor: float = 0.5,
        growth_interval: int = 2000,
    ) -> None:
        super().__init__(optim)
        # Collect the scaler settings first so the constructor call stays short.
        scaler_config = {
            "init_scale": init_scale,
            "growth_factor": growth_factor,
            "backoff_factor": backoff_factor,
            "growth_interval": growth_interval,
        }
        self.scaler = torch.cuda.amp.GradScaler(**scaler_config)

    def backward(self, loss: Tensor, *args, **kwargs) -> None:
        """Scale ``loss`` with the scaler, then run ``backward`` on the scaled tensor.

        Extra positional and keyword arguments are passed through to ``Tensor.backward``.
        """
        self.scale_loss(loss).backward(*args, **kwargs)

    def step(self, *args, **kwargs) -> Optional[float]:
        """Step the wrapped optimizer via the scaler, then refresh the scale.

        Returns:
            Whatever ``self.scaler.step`` returned for this call.
        """
        step_result = self.scaler.step(self.optim, *args, **kwargs)
        # The scaler must be updated after every step attempt.
        self.scaler.update()
        return step_result

    def scale_loss(self, loss: Tensor) -> Tensor:
        """Return ``loss`` after it has been passed through ``self.scaler.scale``."""
        scaled = self.scaler.scale(loss)
        return scaled

    def unscale_grad(self) -> None:
        """Ask the scaler to unscale the gradients of the wrapped optimizer."""
        self.scaler.unscale_(self.optim)

    def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:
        """Unscale the gradients, then delegate value clipping to the parent wrapper."""
        # Clipping thresholds are meant for unscaled gradients, so unscale first.
        self.unscale_grad()
        super().clip_grad_by_value(clip_value, *args, **kwargs)

    def clip_grad_by_norm(
        self,
        max_norm: Union[float, int],
        norm_type: Union[float, int] = 2.0,
        error_if_nonfinite: bool = False,
        *args,
        **kwargs,
    ) -> None:
        """Unscale the gradients, then delegate norm clipping to the parent wrapper."""
        # Clipping thresholds are meant for unscaled gradients, so unscale first.
        self.unscale_grad()
        super().clip_grad_by_norm(max_norm, norm_type, error_if_nonfinite, *args, **kwargs)


class TorchAMPModule(ModelWrapper):
    """
    Wrap a module so that its forward pass executes inside an ``autocast`` context.

    Args:
        module (nn.Module): The module being wrapped.
    """

    def __init__(self, module: nn.Module):
        super().__init__(module)

    def forward(self, *args, **kwargs):
        """Call the wrapped module under ``autocast`` and return its output.

        All positional and keyword arguments are forwarded to ``self.module``.
        """
        with autocast():
            output = self.module(*args, **kwargs)
        return output


class FP16TorchMixedPrecision(MixedPrecision):
    """
    Precision for mixed precision training in FP16 using PyTorch AMP.

    The constructor arguments are stored in ``self.torch_amp_kwargs`` and later
    forwarded to :class:`TorchAMPOptimizer` when an optimizer is configured.

    Args:
        init_scale (float): Initial scale factor. Default: 2**16.
        growth_factor (float): ``growth_factor`` forwarded to the gradient scaler;
            the multiplier used to enlarge the scale. Default: 2.0.
        backoff_factor (float): ``backoff_factor`` forwarded to the gradient scaler;
            the multiplier used to shrink the scale. Default: 0.5.
        growth_interval (int): ``growth_interval`` forwarded to the gradient scaler;
            controls how often the scale may be enlarged. Default: 2000.
    """

    def __init__(
        self,
        init_scale: float = 2.0**16,
        growth_factor: float = 2.0,
        backoff_factor: float = 0.5,
        growth_interval: int = 2000,
    ) -> None:
        super().__init__()
        self.torch_amp_kwargs = dict(
            init_scale=init_scale,
            growth_factor=growth_factor,
            backoff_factor=backoff_factor,
            growth_interval=growth_interval,
        )

    def configure(
        self,
        model: nn.Module,
        optimizer: Optional[Optimizer] = None,
        criterion: Optional[Callable] = None,
    ) -> Tuple[nn.Module, Optional[OptimizerWrapper], Optional[Callable]]:
        """Wrap the training components for AMP.

        Args:
            model (nn.Module): Model to wrap in :class:`TorchAMPModule`.
            optimizer (Optional[Optimizer]): Optimizer to wrap in
                :class:`TorchAMPOptimizer`; left as ``None`` when not given.
            criterion (Optional[Callable]): Loss callable to wrap in
                :class:`TorchAMPModule`; left as ``None`` when not given.

        Returns:
            A ``(model, optimizer, criterion)`` tuple. The model is always wrapped;
            ``optimizer`` and ``criterion`` are wrapped when provided and ``None``
            otherwise.
        """
        model = TorchAMPModule(model)
        if optimizer is not None:
            optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)
        if criterion is not None:
            criterion = TorchAMPModule(criterion)
        return model, optimizer, criterion
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`from typing import Callable, Optional, Tuple, Union`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
			`import torch`
			`import torch.nn as nn`
			`from torch import Tensor`
			`from torch.optim import Optimizer`

[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`from colossalai.interface import ModelWrapper, OptimizerWrapper`
[npu] add npu support for hybrid plugin and llama (#5090) * llama 3d * update * fix autocast 2023-11-22 11:23:21 +00:00			`from colossalai.utils.device import autocast`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`from .mixed_precision_base import MixedPrecision`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`__all__ = ["FP16_Torch_MixedPrecision", "TorchAMPOptimizer", "TorchAMPModule"]`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00

			`class TorchAMPOptimizer(OptimizerWrapper):`
			`"""`
			`Optimizer wrapper for mixed precision training in FP16 using PyTorch AMP.`

			`Args:`
			`optim (Optimizer): Optimizer to wrap.`
			`init_scale (float): Initial scale factor. Default: 2**16.`
			`growth_factor (float): Factor by which the scale is multiplied during`
			:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
			`this iteration. Default: 2.0.`
			`backoff_factor (float): Factor by which the scale is multiplied during`
			:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
			`this iteration. Default: 0.5.`
			growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
			`calls that may cause the scale to increase. Default: 2000.`
			`"""`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`def __init__(`
			`self,`
			`optim: Optimizer,`
			`init_scale: float = 2.0**16,`
			`growth_factor: float = 2.0,`
			`backoff_factor: float = 0.5,`
			`growth_interval: int = 2000,`
			`) -> None:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`super().__init__(optim)`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`self.scaler = torch.cuda.amp.GradScaler(`
			`init_scale=init_scale,`
			`growth_factor=growth_factor,`
			`backoff_factor=backoff_factor,`
			`growth_interval=growth_interval,`
			`)`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
			`def backward(self, loss: Tensor, *args, **kwargs) -> None:`
			`scaled_loss = self.scale_loss(loss)`
			`scaled_loss.backward(*args, **kwargs)`

			`def step(self, *args, **kwargs) -> Optional[float]:`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`out = self.scaler.step(self.optim, *args, **kwargs)`
			`self.scaler.update()`
			`return out`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
			`def scale_loss(self, loss: Tensor) -> Tensor:`
			`return self.scaler.scale(loss)`

			`def unscale_grad(self) -> None:`
			`self.scaler.unscale_(self.optim)`

			`def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None:`
			`self.unscale_grad()`
			`super().clip_grad_by_value(clip_value, *args, **kwargs)`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`def clip_grad_by_norm(`
			`self,`
			`max_norm: Union[float, int],`
			`norm_type: Union[float, int] = 2.0,`
			`error_if_nonfinite: bool = False,`
			`*args,`
			`**kwargs,`
			`) -> None:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`self.unscale_grad()`
			`super().clip_grad_by_norm(max_norm, norm_type, error_if_nonfinite, *args, **kwargs)`


[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`class TorchAMPModule(ModelWrapper):`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`"""`
			`Module wrapper for mixed precision training in FP16 using PyTorch AMP.`

			`Args:`
			`module (nn.Module): Module to wrap.`
			`"""`

			`def __init__(self, module: nn.Module):`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`super().__init__(module)`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
			`def forward(self, *args, **kwargs):`
[npu] add npu support for hybrid plugin and llama (#5090) * llama 3d * update * fix autocast 2023-11-22 11:23:21 +00:00			`with autocast():`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`return self.module(*args, **kwargs)`


			`class FP16TorchMixedPrecision(MixedPrecision):`
			`"""`
			`Precision for mixed precision training in FP16 using PyTorch AMP.`

			`Args:`
			`init_scale (float): Initial scale factor. Default: 2**16.`
			`growth_factor (float): Factor by which the scale is multiplied during`
			:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be finite
			`this iteration. Default: 2.0.`
			`backoff_factor (float): Factor by which the scale is multiplied during`
			:meth:`torch.cuda.amp.GradScaler.step` if gradients were found to be infinite
			`this iteration. Default: 0.5.`
			growth_interval (int): Number of iterations between :meth:`torch.cuda.amp.GradScaler.step`
			`calls that may cause the scale to increase. Default: 2000.`
			`"""`

[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`def __init__(`
			`self,`
			`init_scale: float = 2.0**16,`
			`growth_factor: float = 2.0,`
			`backoff_factor: float = 0.5,`
			`growth_interval: int = 2000,`
			`) -> None:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`super().__init__()`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 2023-09-19 06:20:26 +00:00			`self.torch_amp_kwargs = dict(`
			`init_scale=init_scale,`
			`growth_factor=growth_factor,`
			`backoff_factor=backoff_factor,`
			`growth_interval=growth_interval,`
			`)`

			`def configure(`
			`self,`
			`model: nn.Module,`
			`optimizer: Optional[Optimizer] = None,`
			`criterion: Optional[Callable] = None,`
			`) -> Tuple[nn.Module, OptimizerWrapper, Callable]:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`model = TorchAMPModule(model)`
[booster] make optimizer argument optional for boost (#3993) * feat: make optimizer optional in Booster.boost * test: skip unet test if diffusers version > 0.10.2 2023-06-15 09:38:42 +00:00			`if optimizer is not None:`
			`optimizer = TorchAMPOptimizer(optimizer, **self.torch_amp_kwargs)`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`if criterion is not None:`
			`criterion = TorchAMPModule(criterion)`
			`return model, optimizer, criterion`