# ColossalAI/colossalai/nn/init.py

import math
import warnings

from torch import Tensor
import torch.nn as nn


def zeros_():
    """Build an initializer that fills a tensor with zeros.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that
        passes ``tensor`` to ``nn.init.zeros_`` and returns the result.
    """

    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # fan_in / fan_out are accepted but not used by this initializer.
        zeroed = nn.init.zeros_(tensor)
        return zeroed

    return initializer


def ones_():
    """Build an initializer that fills a tensor with ones.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that
        passes ``tensor`` to ``nn.init.ones_`` and returns the result.
    """

    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # fan_in / fan_out are accepted but not used by this initializer.
        filled = nn.init.ones_(tensor)
        return filled

    return initializer


def uniform_(a: float = 0., b: float = 1.):
    """Build an initializer that samples a tensor from a uniform distribution.

    Args:
        a: Lower bound forwarded to ``nn.init.uniform_``.
        b: Upper bound forwarded to ``nn.init.uniform_``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that
        passes ``tensor`` to ``nn.init.uniform_`` and returns the result.
    """

    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # The bounds are fixed at factory time; fan_in / fan_out are not used.
        sampled = nn.init.uniform_(tensor, a=a, b=b)
        return sampled

    return initializer


def normal_(mean: float = 0., std: float = 1.):
    """Build an initializer that samples a tensor from a normal distribution.

    Args:
        mean: Mean forwarded to ``nn.init.normal_``.
        std: Standard deviation forwarded to ``nn.init.normal_``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that
        passes ``tensor`` to ``nn.init.normal_`` and returns the result.
    """

    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # The distribution is fixed at factory time; fan_in / fan_out are not used.
        sampled = nn.init.normal_(tensor, mean=mean, std=std)
        return sampled

    return initializer


def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):
    """Build an initializer that samples a tensor from a truncated normal.

    Args:
        mean: Mean forwarded to ``nn.init.trunc_normal_``.
        std: Standard deviation forwarded to ``nn.init.trunc_normal_``.
        a: Lower cutoff forwarded to ``nn.init.trunc_normal_``.
        b: Upper cutoff forwarded to ``nn.init.trunc_normal_``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that
        passes ``tensor`` to ``nn.init.trunc_normal_`` and returns the result.
    """

    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # All distribution parameters are fixed at factory time; fans are not used.
        sampled = nn.init.trunc_normal_(tensor, mean=mean, std=std, a=a, b=b)
        return sampled

    return initializer


def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Build a Kaiming-style uniform initializer driven by caller-supplied fans.

    Args:
        a: Parameter forwarded to ``nn.init.calculate_gain`` together with
            ``nonlinearity``.
        mode: ``'fan_in'`` or ``'fan_out'``; selects which fan scales the bound.
        nonlinearity: Name forwarded to ``nn.init.calculate_gain``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.uniform_`` on ``[-bound, bound]`` with
        ``bound = sqrt(3) * gain / sqrt(fan)`` and returns the result.
        It raises ``ValueError`` for an unknown ``mode`` and fails an assertion
        when the fan selected by ``mode`` is ``None``.
    """

    # adapted from torch.nn.init
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # An empty tensor has nothing to sample: warn and hand it back as-is.
        # This check runs before `mode` is validated.
        if any(dim == 0 for dim in tensor.shape):
            warnings.warn("Initializing zero-element tensors is a no-op")
            return tensor

        if mode not in ('fan_in', 'fan_out'):
            raise ValueError(f'Invalid initialization mode \'{mode}\'')

        if mode == 'fan_in':
            assert fan_in is not None, 'Fan_in is not provided.'
            fan = fan_in
        else:
            assert fan_out is not None, 'Fan_out is not provided.'
            fan = fan_out

        gain = nn.init.calculate_gain(nonlinearity, a)
        limit = math.sqrt(3.) * (gain / math.sqrt(fan))
        return nn.init.uniform_(tensor, -limit, limit)

    return initializer


def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Build a Kaiming-style normal initializer driven by caller-supplied fans.

    Args:
        a: Parameter forwarded to ``nn.init.calculate_gain`` together with
            ``nonlinearity``.
        mode: ``'fan_in'`` or ``'fan_out'``; selects which fan scales the std.
        nonlinearity: Name forwarded to ``nn.init.calculate_gain``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.normal_`` with mean 0 and
        ``std = gain / sqrt(fan)`` and returns the result. It raises
        ``ValueError`` for an unknown ``mode`` and fails an assertion when the
        fan selected by ``mode`` is ``None``.
    """

    # adapted from torch.nn.init
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        # An empty tensor has nothing to sample: warn and hand it back as-is.
        # This check runs before `mode` is validated.
        if any(dim == 0 for dim in tensor.shape):
            warnings.warn("Initializing zero-element tensors is a no-op")
            return tensor

        if mode not in ('fan_in', 'fan_out'):
            raise ValueError(f'Invalid initialization mode \'{mode}\'')

        if mode == 'fan_in':
            assert fan_in is not None, 'Fan_in is not provided.'
            fan = fan_in
        else:
            assert fan_out is not None, 'Fan_out is not provided.'
            fan = fan_out

        gain = nn.init.calculate_gain(nonlinearity, a)
        spread = gain / math.sqrt(fan)
        return nn.init.normal_(tensor, 0, spread)

    return initializer


def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):
    """Build a Xavier-style uniform initializer driven by caller-supplied fans.

    Args:
        a: Multiplier turning the computed std into the uniform bound.
        scale: Numerator of the variance term ``scale / fan``.
        gain: Factor applied to the std.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.uniform_`` on ``[-bound, bound]`` with
        ``bound = a * gain * sqrt(scale / fan)``. ``fan`` is
        ``fan_in + fan_out`` when ``fan_out`` is given, otherwise ``fan_in``.
        It fails an assertion when ``fan_in`` is ``None``.
    """

    # adapted from torch.nn.init
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        assert fan_in is not None, 'Fan_in is not provided.'

        # fan_out is optional; when absent only fan_in scales the bound.
        total_fan = fan_in if fan_out is None else fan_in + fan_out

        limit = a * (gain * math.sqrt(scale / float(total_fan)))
        return nn.init.uniform_(tensor, -limit, limit)

    return initializer


def xavier_normal_(scale: float = 2., gain: float = 1.):
    """Build a Xavier-style normal initializer driven by caller-supplied fans.

    Args:
        scale: Numerator of the variance term ``scale / fan``.
        gain: Factor applied to the std.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.normal_`` with mean 0 and
        ``std = gain * sqrt(scale / fan)``. ``fan`` is ``fan_in + fan_out``
        when ``fan_out`` is given, otherwise ``fan_in``. It fails an assertion
        when ``fan_in`` is ``None``.
    """

    # adapted from torch.nn.init
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        assert fan_in is not None, 'Fan_in is not provided.'

        # fan_out is optional; when absent only fan_in scales the std.
        total_fan = fan_in if fan_out is None else fan_in + fan_out

        spread = gain * math.sqrt(scale / float(total_fan))
        return nn.init.normal_(tensor, 0., spread)

    return initializer


def lecun_uniform_():
    """Build a LeCun-style uniform initializer scaled by the caller-supplied fan_in.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.uniform_`` on ``[-bound, bound]`` with
        ``bound = sqrt(3 / fan_in)``. ``fan_out`` is accepted but not used.
        It fails an assertion when ``fan_in`` is ``None``.
    """

    # adapted from jax.nn.initializers
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        assert fan_in is not None, 'Fan_in is not provided.'

        # Target variance is 1 / fan_in; a uniform on [-r, r] has variance r^2 / 3.
        limit = math.sqrt(3 * (1.0 / fan_in))
        return nn.init.uniform_(tensor, -limit, limit)

    return initializer


def lecun_normal_():
    """Build a LeCun-style truncated-normal initializer scaled by fan_in.

    Samples come from a normal distribution truncated at two (scaled) standard
    deviations, rescaled so that the resulting tensor has a standard deviation
    of ``sqrt(1 / fan_in)``.

    Returns:
        A callable ``initializer(tensor, fan_in=None, fan_out=None)`` that fills
        ``tensor`` via ``nn.init.trunc_normal_`` and returns the result.
        ``fan_out`` is accepted but not used. It fails an assertion when
        ``fan_in`` is ``None``.
    """

    # adapted from jax.nn.initializers
    def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):
        assert fan_in is not None, 'Fan_in is not provided.'

        # .87962566103423978 is the std of a standard normal truncated to
        # [-2, 2]; dividing by it compensates for the variance lost to
        # truncation so the samples end up with std sqrt(1 / fan_in).
        std = math.sqrt(1.0 / fan_in) / .87962566103423978

        # nn.init.trunc_normal_ takes its cutoffs as absolute values, not in
        # units of std. With the default [-2, 2] nothing would be truncated for
        # small std and the result would be too wide, so pass +/- 2 * std.
        return nn.init.trunc_normal_(tensor, std=std, a=-2. * std, b=2. * std)

    return initializer
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`import math`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`import warnings`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago
			`from torch import Tensor`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`import torch.nn as nn`


			`def zeros_():`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`return nn.init.zeros_(tensor)`

			`return initializer`


			`def ones_():`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`return nn.init.ones_(tensor)`

			`return initializer`


			`def uniform_(a: float = 0., b: float = 1.):`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`return nn.init.uniform_(tensor, a, b)`

			`return initializer`


			`def normal_(mean: float = 0., std: float = 1.):`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`return nn.init.normal_(tensor, mean, std)`

			`return initializer`


			`def trunc_normal_(mean: float = 0., std: float = 1., a: float = -2., b: float = 2.):`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`return nn.init.trunc_normal_(tensor, mean, std, a, b)`

			`return initializer`


			`def kaiming_uniform_(a=0, mode='fan_in', nonlinearity='leaky_relu'):`
			`# adapted from torch.nn.init`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`if 0 in tensor.shape:`
			`warnings.warn("Initializing zero-element tensors is a no-op")`
			`return tensor`

			`if mode == 'fan_in':`
			`assert fan_in is not None, 'Fan_in is not provided.'`
			`fan = fan_in`
			`elif mode == 'fan_out':`
			`assert fan_out is not None, 'Fan_out is not provided.'`
			`fan = fan_out`
			`else:`
			`raise ValueError(f'Invalid initialization mode \'{mode}\'')`

			`std = nn.init.calculate_gain(nonlinearity, a) / math.sqrt(fan)`
			`bound = math.sqrt(3.) * std`
			`return nn.init.uniform_(tensor, -bound, bound)`

			`return initializer`


			`def kaiming_normal_(a=0, mode='fan_in', nonlinearity='leaky_relu'):`
			`# adapted from torch.nn.init`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`if 0 in tensor.shape:`
			`warnings.warn("Initializing zero-element tensors is a no-op")`
			`return tensor`

			`if mode == 'fan_in':`
			`assert fan_in is not None, 'Fan_in is not provided.'`
			`fan = fan_in`
			`elif mode == 'fan_out':`
			`assert fan_out is not None, 'Fan_out is not provided.'`
			`fan = fan_out`
			`else:`
			`raise ValueError(f'Invalid initialization mode \'{mode}\'')`

			`std = nn.init.calculate_gain(nonlinearity, a) / math.sqrt(fan)`
			`return nn.init.normal_(tensor, 0, std)`

			`return initializer`


			`def xavier_uniform_(a: float = math.sqrt(3.), scale: float = 2., gain: float = 1.):`
			`# adapted from torch.nn.init`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`assert fan_in is not None, 'Fan_in is not provided.'`

			`fan = fan_in`
			`if fan_out is not None:`
			`fan += fan_out`

			`std = gain * math.sqrt(scale / float(fan))`
			`bound = a * std`
			`return nn.init.uniform_(tensor, -bound, bound)`

			`return initializer`


			`def xavier_normal_(scale: float = 2., gain: float = 1.):`
			`# adapted from torch.nn.init`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`assert fan_in is not None, 'Fan_in is not provided.'`

			`fan = fan_in`
			`if fan_out is not None:`
			`fan += fan_out`

			`std = gain * math.sqrt(scale / float(fan))`

			`return nn.init.normal_(tensor, 0., std)`

			`return initializer`


			`def lecun_uniform_():`
			`# adapted from jax.nn.initializers`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`assert fan_in is not None, 'Fan_in is not provided.'`

			`var = 1.0 / fan_in`
			`bound = math.sqrt(3 * var)`
			`return nn.init.uniform_(tensor, -bound, bound)`

			`return initializer`


			`def lecun_normal_():`
			`# adapted from jax.nn.initializers`
			`def initializer(tensor: Tensor, fan_in: int = None, fan_out: int = None):`
			`assert fan_in is not None, 'Fan_in is not provided.'`

Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`std = math.sqrt(1.0 / fan_in)`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`return nn.init.trunc_normal_(tensor, std=std / .87962566103423978)`

			`return initializer`