# ColossalAI/colossalai/nn/loss/__init__.py

from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss

from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss

# Cross-entropy implementations keyed by tensor-parallel mode string.
# CrossEntropyLoss looks a mode up here when env.vocab_parallel is falsy and
# the mode is neither None nor '1d'; those two cases use torch's
# nn.CrossEntropyLoss instead, which is why there is no '1d' entry.
_parallel_cross_entropy = {
    '2d': CrossEntropyLoss2D,
    '2.5d': CrossEntropyLoss2p5D,
    '3d': CrossEntropyLoss3D,
}

# Vocab-parallel cross-entropy implementations keyed by tensor-parallel mode
# string. CrossEntropyLoss looks a mode up here when a tensor-parallel mode is
# set (not None) and env.vocab_parallel is truthy.
_vocab_parallel_cross_entropy = {
    '1d': VocabParallelCrossEntropyLoss1D,
    '2d': VocabParallelCrossEntropyLoss2D,
    '2.5d': VocabParallelCrossEntropyLoss2p5D,
    '3d': VocabParallelCrossEntropyLoss3D,
}


class CrossEntropyLoss(_Loss):
    """Cross-entropy loss that dispatches on the active tensor-parallel mode.

    The concrete criterion is selected once, at construction time:

    * a tensor-parallel mode is set and ``env.vocab_parallel`` is truthy:
      the class registered for that mode in ``_vocab_parallel_cross_entropy``;
    * no tensor-parallel mode, or mode ``'1d'``: ``torch.nn.CrossEntropyLoss``;
    * any other mode: the class registered in ``_parallel_cross_entropy``.

    Args:
        reduction: Whether the loss should be reduced. For the torch
            criterion this is translated to ``reduction='mean'`` (truthy) or
            ``reduction='none'`` (falsy); the parallel criteria receive the
            value unchanged.
        *args: Extra positional arguments forwarded to the selected criterion.
        **kwargs: Extra keyword arguments forwarded to the selected criterion.

    Raises:
        KeyError: If the active mode has no entry in the lookup table that
            the selected branch consults.
    """

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        # Positional unpacking is written before the keyword on purpose:
        # Python binds ``*args`` first even when it is spelled after a keyword
        # argument, so this order shows what actually happens at the call.
        if tensor_parallel is not None and env.vocab_parallel:
            loss_cls = _vocab_parallel_cross_entropy[tensor_parallel]
            self.loss = loss_cls(*args, reduction=reduction, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            # torch expects a string mode; keep the bool parameter untouched
            # and translate into a separate local.
            torch_reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(*args, reduction=torch_reduction, **kwargs)
        else:
            loss_cls = _parallel_cross_entropy[tensor_parallel]
            self.loss = loss_cls(*args, reduction=reduction, **kwargs)

    def forward(self, *args, **kwargs):
        """Evaluate the wrapped criterion.

        All positional and keyword arguments are passed straight through to
        the criterion chosen in ``__init__`` and its result is returned.
        """
        return self.loss(*args, **kwargs)
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`from colossalai.global_variables import tensor_parallel_env as env`
			`from colossalai.nn.layer.utils import get_tensor_parallel_mode`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`from torch import nn`
			`from torch.nn.modules.loss import *`
			`from torch.nn.modules.loss import _Loss`
Migrated project 3 years ago
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`from .loss_1d import VocabParallelCrossEntropyLoss1D`
			`from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D`
			`from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D`
			`from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D`
Added MoE parallel (#127) 3 years ago			`from .loss_moe import MoeCrossEntropyLoss, MoeLoss`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago
			`_parallel_cross_entropy = {`
			`'2d': CrossEntropyLoss2D,`
			`'2.5d': CrossEntropyLoss2p5D,`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`'3d': CrossEntropyLoss3D,`
			`}`

			`_vocab_parallel_cross_entropy = {`
			`'1d': VocabParallelCrossEntropyLoss1D,`
			`'2d': VocabParallelCrossEntropyLoss2D,`
			`'2.5d': VocabParallelCrossEntropyLoss2p5D,`
			`'3d': VocabParallelCrossEntropyLoss3D,`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`}`


			`class CrossEntropyLoss(_Loss):`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`def __init__(self, reduction: bool = True, *args, **kwargs):`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`super().__init__()`
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`tensor_parallel = get_tensor_parallel_mode()`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 3 years ago			`if tensor_parallel is not None and env.vocab_parallel:`
			`self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)`
			`elif tensor_parallel is None or tensor_parallel == '1d':`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`reduction = 'mean' if reduction else 'none'`
			`self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)`
			`else:`
			`self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)`

			`def forward(self, *args):`
			`return self.loss(*args)`