ColossalAI/colossalai/nn/layer/layernorm.py

"""This code is from NVIDIA apex:
      https://github.com/NVIDIA/apex
   with some changes. """

import numbers

import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.nn import init
from torch.nn.parameter import Parameter

from colossalai.kernel.kernel_loader import LayerNormLoader

# Use the `layer_norm` extension exposed by `colossalai._C` when it can be imported.
# Otherwise leave the handle as None so that FusedLayerNormAffineFunction.forward
# loads it on first use through LayerNormLoader.
try:
    from colossalai._C import layer_norm
except ImportError:
    layer_norm = None


class FusedLayerNormAffineFunction(torch.autograd.Function):
    """Autograd function that dispatches affine layer norm to the fused kernel module.

    The kernel module is read from the module-level ``layer_norm`` handle. When that
    handle is still ``None`` it is loaded once through ``LayerNormLoader`` and cached
    back into the module global, so later calls reuse the same module.
    """

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, input, weight, bias, normalized_shape, eps):
        """Run ``forward_affine`` on the kernel module and save state for backward.

        Args:
            ctx: autograd context; receives ``normalized_shape``, ``eps``, the kernel
                handle (``layernorm_op``) and the tensors needed by ``backward``.
            input: tensor to normalize; made contiguous before the kernel call.
            weight: affine weight tensor; made contiguous before the kernel call.
            bias: affine bias tensor; made contiguous before the kernel call.
            normalized_shape: passed through to the kernel unchanged.
            eps: passed through to the kernel unchanged.

        Returns:
            The ``output`` tensor returned by ``forward_affine``.
        """
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()

        # Load the kernel lazily and cache it in the module global so the loader
        # runs at most once per process.
        global layer_norm
        if layer_norm is None:
            layer_norm = LayerNormLoader().load()
        output, mean, invvar = layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_, ctx.eps)

        # Keep the exact kernel module used here so backward does not depend on the
        # module global still pointing at the same object.
        ctx.layernorm_op = layer_norm
        ctx.save_for_backward(input_, weight_, bias_, mean, invvar)

        return output

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output):
        """Compute gradients with ``backward_affine`` from the kernel saved in forward.

        Args:
            ctx: autograd context populated by ``forward``.
            grad_output: gradient w.r.t. the forward output; made contiguous before
                the kernel call.

        Returns:
            A 5-tuple ``(grad_input, grad_weight, grad_bias, None, None)``; the two
            trailing ``None`` entries correspond to ``normalized_shape`` and ``eps``,
            which receive no gradient.
        """
        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input, grad_weight, grad_bias = ctx.layernorm_op.backward_affine(
            grad_output.contiguous(), mean, invvar, input_, ctx.normalized_shape, weight_, bias_, ctx.eps
        )

        return grad_input, grad_weight, grad_bias, None, None


class MixedFusedLayerNorm(torch.nn.Module):
    """Layer-norm module whose forward pass goes through ``FusedLayerNormAffineFunction``.

    Owns a learnable ``weight`` and ``bias``, both shaped like ``normalized_shape``.
    """

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None):
        """Create the affine parameters and initialize them.

        Args:
            normalized_shape: an integer or a sequence of integers; an integer is
                treated as a one-element shape.
            eps: value stored on the module and handed to the fused function.
            device: device on which the parameters are allocated.
            dtype: dtype of the parameters.
        """
        super().__init__()

        # A bare integer means a single normalized dimension.
        shape = (normalized_shape,) if isinstance(normalized_shape, numbers.Integral) else normalized_shape
        factory_kwargs = {"device": device, "dtype": dtype}

        self.normalized_shape = torch.Size(shape)
        self.eps = eps
        self.weight = Parameter(torch.empty(*shape, **factory_kwargs))
        self.bias = Parameter(torch.empty(*shape, **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with ones and ``bias`` with zeros, in place."""
        for param, fill_ in ((self.weight, init.ones_), (self.bias, init.zeros_)):
            fill_(param)

    def forward(self, input):
        """Apply the fused affine layer norm to ``input`` using this module's parameters."""
        fused_args = (input, self.weight, self.bias, self.normalized_shape, self.eps)
        return FusedLayerNormAffineFunction.apply(*fused_args)

    def __repr__(self):
        """Return a short description showing ``normalized_shape`` and ``eps``."""
        shape, eps = self.normalized_shape, self.eps
        return f"MixedFusedLayerNorm(normalized_shape={shape}, eps={eps})"
add colossalai kernel module (#55) 3 years ago			`"""This code is from NVIDIA apex:`
			`https://github.com/NVIDIA/apex`
			`with some changes. """`

			`import numbers`
[kernel] move all symlinks of kernel to `colossalai._C` (#1971) 2 years ago
add colossalai kernel module (#55) 3 years ago			`import torch`
[kernel] move all symlinks of kernel to `colossalai._C` (#1971) 2 years ago			`from torch.cuda.amp import custom_bwd, custom_fwd`
add colossalai kernel module (#55) 3 years ago			`from torch.nn import init`
[kernel] move all symlinks of kernel to `colossalai._C` (#1971) 2 years ago			`from torch.nn.parameter import Parameter`
add colossalai kernel module (#55) 3 years ago
[feat] refactored extension module (#5298) * [feat] refactored extension module * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish 10 months ago			`from colossalai.kernel.kernel_loader import LayerNormLoader`
[kernel] fixed repeated loading of kernels (#2549) * [kernel] fixed repeated loading of kernels * polish code * polish code 2 years ago
			`try:`
			`from colossalai._C import layer_norm`
			`except ImportError:`
			`layer_norm = None`

add colossalai kernel module (#55) 3 years ago
			`class FusedLayerNormAffineFunction(torch.autograd.Function):`
			`@staticmethod`
adapted for sequence parallel (#163) 3 years ago			`@custom_fwd(cast_inputs=torch.float32)`
add colossalai kernel module (#55) 3 years ago			`def forward(ctx, input, weight, bias, normalized_shape, eps):`
			`ctx.normalized_shape = normalized_shape`
			`ctx.eps = eps`
			`input_ = input.contiguous()`
			`weight_ = weight.contiguous()`
			`bias_ = bias.contiguous()`
[kernel] fixed repeated loading of kernels (#2549) * [kernel] fixed repeated loading of kernels * polish code * polish code 2 years ago
			`global layer_norm`
			`if layer_norm is None:`
[feat] refactored extension module (#5298) * [feat] refactored extension module * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish 10 months ago			`layer_norm = LayerNormLoader().load()`
[hotfix] issue #2388 2 years ago			`output, mean, invvar = layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_, ctx.eps)`
[kernel] fixed repeated loading of kernels (#2549) * [kernel] fixed repeated loading of kernels * polish code * polish code 2 years ago			`ctx.layernorm_op = layer_norm`
add colossalai kernel module (#55) 3 years ago			`ctx.save_for_backward(input_, weight_, bias_, mean, invvar)`

			`return output`

			`@staticmethod`
adapted for sequence parallel (#163) 3 years ago			`@custom_bwd`
add colossalai kernel module (#55) 3 years ago			`def backward(ctx, grad_output):`
			`input_, weight_, bias_, mean, invvar = ctx.saved_tensors`
			`grad_input = grad_weight = grad_bias = None`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`grad_input, grad_weight, grad_bias = layer_norm.backward_affine(`
			`grad_output.contiguous(), mean, invvar, input_, ctx.normalized_shape, weight_, bias_, ctx.eps`
			`)`
add colossalai kernel module (#55) 3 years ago
			`return grad_input, grad_weight, grad_bias, None, None`


			`class MixedFusedLayerNorm(torch.nn.Module):`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None):`
add colossalai kernel module (#55) 3 years ago			`super(MixedFusedLayerNorm, self).__init__()`

			`if isinstance(normalized_shape, numbers.Integral):`
			`normalized_shape = (normalized_shape,)`
			`self.normalized_shape = torch.Size(normalized_shape)`
			`self.eps = eps`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`self.weight = Parameter(torch.empty(*normalized_shape, device=device, dtype=dtype))`
			`self.bias = Parameter(torch.empty(*normalized_shape, device=device, dtype=dtype))`
add colossalai kernel module (#55) 3 years ago			`self.reset_parameters()`

			`def reset_parameters(self):`
			`init.ones_(self.weight)`
			`init.zeros_(self.bias)`

			`def forward(self, input):`
[NFC] polish colossalai/kernel/cuda_native/layer_norm.py code style (#980) 3 years ago			`return FusedLayerNormAffineFunction.apply(input, self.weight, self.bias, self.normalized_shape, self.eps)`
adapted for sequence parallel (#163) 3 years ago
			`def __repr__(self):`
[misc] update pre-commit and run all files (#4752) * [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format 1 year ago			`return f"MixedFusedLayerNorm(normalized_shape={self.normalized_shape}, eps={self.eps})"`