# ColossalAI/colossalai/shardformer/layer/dropout.py

from typing import List, Union

import torch
import torch.nn as nn
from torch.distributed import ProcessGroup

from .parallel_module import ParallelModule
from .utils import create_randomizer_with_offset

__all__ = ['Dropout1D']


class Dropout1D(ParallelModule, nn.Dropout):
    """
    Dropout layer whose mask is drawn under a randomizer bound to a process group.

    It behaves like ``nn.Dropout`` except that every forward pass runs inside the RNG context provided by
    ``self.randomizer``. The intent is that ranks of the given process group do not produce the same dropout
    mask at the same positions, which would otherwise lead to poor convergence.

    Args:
        p (float): probability of an element to be zeroed. Defaults to 0.5.
        inplace (bool): If set to True, will do this operation in-place. Defaults to False.
        process_group (ProcessGroup): the process group handed to the randomizer factory. Defaults to None.
    """

    def __init__(self, p: float = 0.5, inplace: bool = False, process_group: ProcessGroup = None):
        # Method lookup starts *after* nn.Dropout in the MRO, so ParallelModule.__init__
        # is not run by this call; only the dropout/module base initialisation happens.
        super(nn.Dropout, self).__init__(p=p, inplace=inplace)

        # The randomizer is derived from the process-wide initial seed; the helper is
        # expected to offset it (per randomizer index and rank) -- see its definition.
        self.randomizer = create_randomizer_with_offset(torch.random.initial_seed(),
                                                        process_group=process_group)

    @staticmethod
    def from_native_module(module: nn.Dropout,
                           process_group: Union[ProcessGroup, List[ProcessGroup]] = None) -> "Dropout1D":
        """
        Build a Dropout1D that mirrors the ``p`` and ``inplace`` settings of a native dropout layer.

        Args:
            module (nn.Dropout): the layer whose configuration is copied.
            process_group: forwarded unchanged to the ``Dropout1D`` constructor.

        Returns:
            Dropout1D: a new layer; ``module`` itself is not modified.
        """
        return Dropout1D(p=module.p, inplace=module.inplace, process_group=process_group)

    def forward(self, input):
        """
        Apply the parent class's dropout to ``input`` inside the randomizer's forked RNG context.
        """
        rng_scope = self.randomizer.fork_rng()
        with rng_scope:
            # Returning from inside the ``with`` still runs the context manager's exit.
            return super().forward(input)