[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
Hongxin Liu 2023-09-11 16:24:28 +08:00 committed by GitHub
parent 536397cc95
commit 554aa9592e
170 changed files with 781 additions and 758 deletions
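
Taken together, the diffs below amount to an import-path migration: colossalai.communication and the tensor-parallel pieces of colossalai.nn now live under colossalai.legacy. A minimal sketch of what that means for downstream code, with the old paths taken from removed lines and the new ones from added lines in this commit:

# before: import colossalai.nn as col_nn
# before: from colossalai.communication import broadcast

# after: the same symbols are imported from the legacy namespace
import colossalai.legacy.nn as col_nn
from colossalai.legacy.communication import broadcast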

View File

@@ -4,7 +4,7 @@ from typing import Optional, Set
import torch
import torch.nn as nn

-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
from colossalai.zero.legacy.gemini.tensor_utils import free_storage

from .region_manager import RegionManager

View File

@ -1,5 +1,4 @@
class Registry: class Registry:
# TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name

View File

@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
import torch
import torch.nn as nn
from torch.optim import Optimizer
-from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
-from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.nn.optimizer import ColossalaiOptimizer
@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
        checkpoint_path (str): Path to the checkpoint directory.
        is_master (bool): Whether current rank is main process.
    """
+    try:
+        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
+        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
+    except ImportError:
+        return
    if not isinstance(model, PreTrainedModel):
        return
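
The added try/except turns transformers into an optional dependency for this helper. A minimal standalone sketch of the same guard (function body elided; argument names follow the signature shown in the hunk header):

import torch.nn as nn


def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = True) -> None:
    # Import transformers lazily; if it is not installed there is no HF config to save.
    try:
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return
    # Models that are not HuggingFace PreTrainedModel instances are skipped as well.
    if not isinstance(model, PreTrainedModel):
        return
    # ... write the model config to checkpoint_path (as in the real function) ...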

View File

@@ -1,6 +1,6 @@
import torch

-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):

View File

@@ -1,6 +1,6 @@
import torch

-from colossalai.nn.layer.colossalai_layer import Embedding, Linear
+from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
from colossalai.utils import get_current_device

from .bias_dropout_add import bias_dropout_add_fused_train

View File

@@ -1,9 +1,17 @@
-from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
-from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
-                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
-                  recv_forward, recv_backward)
+from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
+from .p2p import (
+    recv_backward,
+    recv_forward,
+    send_backward,
+    send_backward_recv_backward,
+    send_backward_recv_forward,
+    send_forward,
+    send_forward_backward_recv_forward_backward,
+    send_forward_recv_backward,
+    send_forward_recv_forward,
+)
from .ring import ring_forward
-from .utils import send_obj_meta, recv_obj_meta
+from .utils import recv_obj_meta, send_obj_meta

__all__ = [
    'all_gather',
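
For reference, a usage sketch of the relocated package: the collective and p2p helpers keep their names and only the import root gains the legacy prefix. The positional arguments below are assumptions based on the ColossalAI collective API, not taken from this diff, and the calls presuppose an initialized distributed context (e.g. via colossalai.launch).

import torch

from colossalai.context import ParallelMode
from colossalai.legacy.communication import all_gather, broadcast

# Assumed call shapes: all_gather(tensor, dim, parallel_mode) and broadcast(tensor, src, parallel_mode).
tensor = torch.ones(4, device='cuda')
gathered = all_gather(tensor, 0, ParallelMode.DATA)
broadcast(tensor, 0, ParallelMode.DATA)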

View File

@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union
import torch.cuda

-import colossalai.communication as comm
+import colossalai.legacy.communication as comm
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc

View File

@@ -5,10 +5,10 @@ from typing import Iterable, Tuple
import torch.cuda

-import colossalai.communication.p2p_v2 as comm
-from colossalai import engine
+import colossalai.legacy.communication.p2p_v2 as comm
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.engine import Engine
from colossalai.utils.cuda import get_current_device

from ._pipeline_schedule import PipelineSchedule
@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
    """

    def forward_backward_step(self,
-                              engine: engine.Engine,
+                              engine: Engine,
                              data_iter: Iterable,
                              forward_only=False,
                              return_loss=True,

View File

@@ -0,0 +1,4 @@
from ._ops import *
from .layer import *
from .loss import *
from .metric import *

View File

@@ -4,7 +4,7 @@ import torch
import torch.distributed as dist

from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import divide
+from colossalai.legacy.nn.layer.utils import divide
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup

GeneralTensor = Union[ColoTensor, torch.Tensor]
@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
    return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)


-### table wise embedding shard
+# table wise embedding shard
def _all_to_all_for_tablewise(x: torch.Tensor,

View File

@@ -1,8 +1,10 @@
-import torch.nn.functional as F
from typing import Optional

+import torch.nn.functional as F
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
-    ReplicaSpec
+
from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input

View File

@@ -1,9 +1,11 @@
-import torch.nn.functional as F
from typing import Optional

+import torch.nn.functional as F
from torch import Tensor

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
+
from ._utils import GeneralTensor, convert_to_colo_tensor

View File

@@ -1,7 +1,10 @@
from typing import List, Optional

import torch.nn.functional as F

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
+
from ._utils import GeneralTensor, convert_to_colo_tensor

View File

@@ -1,9 +1,12 @@
+from typing import Optional
+
import torch
import torch.nn.functional as F
-from typing import Optional

-from colossalai.tensor.op_wrapper import colo_op_impl
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl
+
from ._utils import GeneralTensor, convert_to_colo_tensor

View File

@@ -0,0 +1,9 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *

View File

@@ -1,7 +1,7 @@
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']

View File

@@ -1,151 +1,152 @@
import math
from typing import Callable

-from colossalai.utils import get_current_device
from torch import dtype, nn

-from ... import init as init
+from colossalai.nn import init
+from colossalai.utils import get_current_device
+
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaPatchEmbedding
from ._utils import ColossalaiModule

_parallel_embedding = {
    '1d': Embedding1D,
    '2d': Embedding2D,
    '2.5d': Embedding2p5D,
    '3d': Embedding3D,
}

_vocab_parallel_embedding = {
    '1d': VocabParallelEmbedding1D,
    '2d': VocabParallelEmbedding2D,
    '2.5d': VocabParallelEmbedding2p5D,
    '3d': VocabParallelEmbedding3D
}

_parallel_patchembedding = {
    None: VanillaPatchEmbedding,
    '1d': PatchEmbedding1D,
    '2d': PatchEmbedding2D,
    '2.5d': PatchEmbedding2p5D,
    '3d': PatchEmbedding3D
}


class Embedding(ColossalaiModule):
    r"""Embedding for colossalai.

    Args:
        num_embeddings (int): number of embeddings.
        embedding_dim (int): dimension of embedding.
        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
            therefore, the embedding vector at padding_idx is not updated during training,
            i.e. it remains as a fixed pad, defaults to None.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        weight_initializer (:class:`typing.Callable`, optional):
            he initializer of weight, defaults to normal initializer.

        The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
        ::

            max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
                renormalized to have norm max_norm. Note: this will modify weight in-place.
            norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
            scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
                of frequency of the words in the mini-batch. Default False.
            sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

        More details about ``args`` and ``kwargs`` could be found in
        `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

        More details about ``initializer`` please refer to
        `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 padding_idx: int = None,
                 dtype: dtype = None,
                 weight_initializer: Callable = init.normal_(),
                 vocab_parallel_limit: int = 2048,
                 *args,
                 **kwargs) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
                                 **kwargs).to(dtype).to(get_current_device())
            weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
        elif num_embeddings <= vocab_parallel_limit:
            embed = _parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        else:
            embed = _vocab_parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        super().__init__(embed)


class PatchEmbedding(ColossalaiModule):
    """2D Image to Patch Embedding.

    Args:
        img_size (int): image size.
        patch_size (int): patch size.
        in_chans (int): number of channels of input image.
        embed_size (int): size of embedding.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        flatten (bool, optional): whether to flatten output tensor, defaults to True.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to kaiming uniform initializer.
        bias_initializer (:class:`typing.Callable`, optional):
            The initializer of bias, defaults to xavier uniform initializer.
        position_embed_initializer (:class:`typing.Callable`, optional):
            The initializer of position embedding, defaults to zeros initializer.

        More details about ``initializer`` please refer to
        `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
    """

    def __init__(
        self,
        img_size: int,
        patch_size: int,
        in_chans: int,
        embed_size: int,
        dtype: dtype = None,
        flatten: bool = True,
        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
        position_embed_initializer: Callable = init.zeros_()
    ) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        embed = _parallel_patchembedding[tensor_parallel](
            img_size,
            patch_size,
            in_chans,
            embed_size,
            dtype=dtype,
            flatten=flatten,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            position_embed_initializer=position_embed_initializer,
        )
        super().__init__(embed)

View File

@@ -4,9 +4,9 @@ from typing import Callable
from torch import dtype, nn

+from colossalai.nn import init
from colossalai.utils import get_current_device

-from ... import init as init
from ..parallel_1d import *
from ..parallel_2d import *
from ..parallel_2p5d import *

View File

@@ -1,41 +1,42 @@
-from colossalai.utils import get_current_device
from torch import nn

+from colossalai.utils import get_current_device
+
from ..parallel_1d import LayerNorm1D
from ..parallel_2d import LayerNorm2D
from ..parallel_2p5d import LayerNorm2p5D
from ..parallel_3d import LayerNorm3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaLayerNorm
from ._utils import ColossalaiModule

_parallel_layernorm = {
    None: VanillaLayerNorm,
    "1d": LayerNorm1D,
    "2d": LayerNorm2D,
    "2.5d": LayerNorm2p5D,
    "3d": LayerNorm3D,
}


class LayerNorm(ColossalaiModule):
    r"""Layer Normalization for colossalai.

    Args:
        normalized_shape (int): input shape from an expected input of size.
            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
            \times \ldots \times \text{normalized_shape}[-1]]`
            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
        bias (bool, optional): Whether to add a bias, defaults to ``True``.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
    """

    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
        else:
            norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
        super().__init__(norm)
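
A usage note for the relocated wrapper above: with no tensor parallel mode configured it falls back to plain torch.nn.LayerNorm, otherwise it dispatches to LayerNorm1D/2D/2p5D/3D. The import path below is an assumption based on the other hunks in this commit (the colossalai_layer package under colossalai.legacy.nn.layer):

# Assumes the parallel context has been set up (or left unset) before construction,
# since the wrapper queries get_tensor_parallel_mode() in __init__.
from colossalai.legacy.nn.layer.colossalai_layer import LayerNorm

norm = LayerNorm(1024, eps=1e-5)    # plain torch.nn.LayerNorm when tensor parallelism is off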

View File

@@ -0,0 +1,17 @@
from .layers import (
Classifier1D,
Dropout1D,
Embedding1D,
LayerNorm1D,
Linear1D,
Linear1D_Col,
Linear1D_Row,
PatchEmbedding1D,
VocabParallelClassifier1D,
VocabParallelEmbedding1D,
)
__all__ = [
'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
]

View File

@@ -3,6 +3,7 @@
import torch
import torch.distributed as dist

+from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the corresponding chuck to the rank.

    Args:
        input_: input matrix.
        parallel_mode: parallel mode.

View File

@@ -10,11 +10,11 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn.parameter import Parameter

-from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (

View File

@@ -1,6 +1,13 @@
from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)

__all__ = [
    'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',

View File

@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple

import torch
import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device


def matmul_2d(
@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opb = [None] * 2
@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opb = [None] * 2
        opr = [None] * 2
@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opr = [None] * 2

View File

@@ -8,10 +8,10 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter

-from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict

View File

@@ -1,6 +1,13 @@
from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
-from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
-                     VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
+from .layers import (
+    Classifier2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VocabParallelClassifier2p5D,
+    VocabParallelEmbedding2p5D,
+)

__all__ = [
    'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',

View File

@@ -2,13 +2,14 @@ from typing import Any, Tuple

import torch
import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
+from colossalai.utils import get_current_device


def get_parallel_group(parallel_mode: ParallelMode):
    return gpc.get_group(parallel_mode)

View File

@@ -8,10 +8,10 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter

-from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (

View File

@@ -1,6 +1,13 @@
from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
-from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
-                     VocabParallelEmbedding3D)
+from .layers import (
+    Classifier3D,
+    Embedding3D,
+    LayerNorm3D,
+    Linear3D,
+    PatchEmbedding3D,
+    VocabParallelClassifier3D,
+    VocabParallelEmbedding3D,
+)

__all__ = [
    'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',

View File

@@ -7,10 +7,10 @@ import torch
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter

from ._utils import get_parallel_mode_from_env, push_async_grad

View File

@@ -8,14 +8,14 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn import Parameter

-from colossalai.communication import all_reduce, broadcast
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import all_reduce, broadcast
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
-from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.utils.checkpointing import (
    broadcast_state_dict,
    gather_tensor_parallel_state_dict,

View File

@@ -1,4 +1,4 @@
-from ._operation import RingQK, RingAV
+from ._operation import RingAV, RingQK
from .layers import TransformerSelfAttentionRing

__all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']

View File

@@ -3,13 +3,13 @@
import torch
from torch import distributed as dist
+from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import ring_forward
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
+from colossalai.legacy.communication import ring_forward
+from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
from colossalai.utils import get_current_device
-from torch.cuda.amp import custom_bwd, custom_fwd


class RingQK(torch.autograd.Function):

View File

@@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.kernel import FusedScaleMaskSoftmax
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
from colossalai.legacy.registry import LAYERS
-from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK


@LAYERS.register_module

View File

@@ -0,0 +1,15 @@
from .common import (
ACT2FN,
CheckpointModule,
_ntuple,
divide,
get_tensor_parallel_mode,
set_tensor_parallel_attribute_by_partition,
set_tensor_parallel_attribute_by_size,
to_2tuple,
)
__all__ = [
'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]

View File

@@ -6,10 +6,11 @@ from itertools import repeat
import numpy as np
import torch
+from torch import Tensor, nn

from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.utils import checkpoint
-from torch import Tensor, nn


class CheckpointModule(nn.Module):

View File

@@ -1,6 +1,8 @@
-import torch.nn as nn
-import torch.distributed as dist
from typing import List, Tuple, Union

+import torch.distributed as dist
+import torch.nn as nn
+
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc

View File

@@ -0,0 +1,41 @@
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
_parallel_cross_entropy = {
'2d': CrossEntropyLoss2D,
'2.5d': CrossEntropyLoss2p5D,
'3d': CrossEntropyLoss3D,
}
_vocab_parallel_cross_entropy = {
'1d': VocabParallelCrossEntropyLoss1D,
'2d': VocabParallelCrossEntropyLoss2D,
'2.5d': VocabParallelCrossEntropyLoss2p5D,
'3d': VocabParallelCrossEntropyLoss3D,
}
class CrossEntropyLoss(_Loss):
def __init__(self, reduction: bool = True, *args, **kwargs):
super().__init__()
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is not None and env.vocab_parallel:
self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
elif tensor_parallel is None or tensor_parallel == '1d':
reduction = 'mean' if reduction else 'none'
self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
else:
self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
def forward(self, *args):
return self.loss(*args)
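
A usage sketch for the wrapper defined above; the import path assumes this file is the __init__ of the colossalai.legacy.nn.loss package, consistent with the loss_1d path used elsewhere in this commit:

import torch

from colossalai.legacy.nn.loss import CrossEntropyLoss

# With no tensor parallel mode configured this reduces to torch.nn.CrossEntropyLoss.
criterion = CrossEntropyLoss(reduction=True)
logits = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
loss = criterion(logits, labels)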

View File

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
-from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.utils import get_current_device

View File

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
-from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.utils import get_current_device

View File

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.utils import get_current_device

View File

@@ -1,26 +1,28 @@
from torch import nn

+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
-from colossalai.nn.layer.utils import get_tensor_parallel_mode

_parallel_accuracy = {
    '2d': Accuracy2D,
    '2.5d': Accuracy2p5D,
    '3d': Accuracy3D,
}


class Accuracy(nn.Module):
+
    def __init__(self):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel not in _parallel_accuracy:
            self.acc = calc_acc
        else:
            self.acc = _parallel_accuracy[tensor_parallel]()

    def forward(self, *args):
        return self.acc(*args)

View File

@@ -1,7 +1,7 @@
import torch


def calc_acc(logits, targets):
    preds = torch.argmax(logits, dim=-1)
    correct = torch.sum(targets == preds)
    return correct

View File

@@ -1,7 +1,8 @@
import torch
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from torch import nn

+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+
from ._utils import calc_acc

View File

@@ -1,7 +1,8 @@
import torch
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from torch import nn

+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+
from ._utils import calc_acc

View File

@@ -1,33 +1,35 @@
import torch
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from torch import nn

+from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+
from ._utils import calc_acc


class Accuracy3D(nn.Module):
    """Accuracy for 3D parallelism
    """
+
    def __init__(self):
        super().__init__()
        self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
        self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

    def forward(self, logits, targets):
        """Calculate the accuracy of predicted labels.

        Args:
            logits (:class:`torch.tensor`): Predicted labels.
            targets (:class:`torch.tensor`): True labels from data.

        Returns:
            float: the accuracy of prediction.
        """
        with torch.no_grad():
            targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
            targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
            correct = calc_acc(logits, targets)
            correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
        return correct

View File

@@ -1,10 +1,17 @@
+from .cache_embedding import (
+    CachedEmbeddingBag,
+    CachedParamMgr,
+    EvictionStrategy,
+    LimitBuffIndexCopyer,
+    ParallelCachedEmbeddingBag,
+    ParallelCachedEmbeddingBagTablewise,
+    ParallelCachedEmbeddingBagTablewiseSpiltCache,
+    TablewiseEmbeddingBagConfig,
+)
from .colo_module import ColoModule
-from .linear import ColoLinear
from .embedding import ColoEmbedding
-from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
-from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
-    ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
+from .linear import ColoLinear
+from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module

__all__ = [
    'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',

View File

@@ -1,8 +1,8 @@
from .cache_mgr import CachedParamMgr, EvictionStrategy
-from .copyer import LimitBuffIndexCopyer
from .cached_embedding import CachedEmbeddingBag
-from .parallel_cached_embedding import ParallelCachedEmbeddingBag
+from .copyer import LimitBuffIndexCopyer
from .embedding_config import TablewiseEmbeddingBagConfig
+from .parallel_cached_embedding import ParallelCachedEmbeddingBag
from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache

View File

@@ -1,4 +1,5 @@
import abc
+
import torch.nn as nn

View File

@@ -1,12 +1,14 @@
-import numpy as np
-import torch
-from torch.profiler import record_function
-from typing import List, Optional
-from contexttimer import Timer
-from .copyer import LimitBuffIndexCopyer
-from enum import Enum
import sys
from contextlib import contextmanager
+from enum import Enum
+from typing import List, Optional
+
+import numpy as np
+import torch
+from contexttimer import Timer
+from torch.profiler import record_function
+
+from .copyer import LimitBuffIndexCopyer


class EvictionStrategy(Enum):
@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
class CachedParamMgr(torch.nn.Module):
    """
    Manage Embedding Weights on CPU and CUDA memory uses a software cache.
    CPU maintains the entire original weight.
    CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
    During training, GPU needs to transmit embedding rows between CPU and GPU.

    Args:
@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
                self._elapsed_dict[name] += t.elapsed

    def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
        """_find_evict_gpu_idxs
        Find the gpu idxs to be evicted, according to their freq.

        Args:
            evict_num (int): how many rows has to be evicted
@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
        """reorder
        reorder the weight according to ids' frequency in dataset before training.
        Execute only once before training, also known as warmup phase.

        Note:
            If you would like to use the DATASET as the eviction strategy, you must call this function.
        Note:
@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
        """
        deprecated
        evict one row from cuda to cpu.

        Returns:
            (int) : the slot id be evicted.
        """
        mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)

View File

@@ -1,10 +1,11 @@
+from typing import Iterator, List, Optional, Tuple, Union
+
import torch
import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple, Union
+from torch.nn.parameter import Parameter

from .base_embedding import BaseEmbeddingBag
from .cache_mgr import CachedParamMgr, EvictionStrategy
-from torch.nn.parameter import Parameter


class CachedEmbeddingBag(BaseEmbeddingBag):
@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
        include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
        dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
        device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
        cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
        warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
        buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
                  buffer_size=50_000,
                  pin_weight=False):
        """
        Called after initialized.
        Reorder the weight rows according to the ids_freq_mapping.
        Then, let the weights of the Module be managed by a CachedParamMgr.

        Args:
            cuda_row_num (int): number of rows can be hosted in CUDA memory
            ids_freq_mapping (List[int]): a list, idx is id number, value is freq

View File

@ -3,7 +3,7 @@ from torch import LongTensor
class LimitBuffIndexCopyer(object): class LimitBuffIndexCopyer(object):
"""LimitBuffIndexCopyer """LimitBuffIndexCopyer
Index Copy using limited temp buffer on CUDA. Index Copy using limited temp buffer on CUDA.
Args: Args:
@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
@torch.no_grad() @torch.no_grad()
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor): def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
"""copy """copy
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index] src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
The valid rows in the src tensor are contiguous, while the rows in the tgt tensor are scattered. The valid rows in the src tensor are contiguous, while the rows in the tgt tensor are scattered.
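
The docstring above describes a bounded-buffer copy: rows are gathered from contiguous positions in src with index_select, staged in a small temporary tensor, then scattered into tgt with index_copy_. A stand-alone sketch of that pattern, with the buffer bound exposed as a plain argument instead of the class's internal attribute:

    import torch

    def buffered_index_copy(dim, src_index, tgt_index, src, tgt, limit=1024):
        # Move at most `limit` rows per step so the temporary buffer stays small.
        n = src_index.numel()
        for start in range(0, n, limit):
            end = min(start + limit, n)
            tmp = src.index_select(dim, src_index[start:end])  # gather a contiguous chunk
            tgt.index_copy_(dim, tgt_index[start:end], tmp)    # scatter into target rows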


@ -1,12 +1,13 @@
from typing import Iterator, List, Optional, Tuple
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from typing import List, Optional, Iterator, Tuple
from .cached_embedding import CachedEmbeddingBag from colossalai.legacy.nn._ops._utils import dual_all_to_all
from colossalai.nn._ops._utils import dual_all_to_all from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
from .cache_mgr import CachedParamMgr, EvictionStrategy from .cache_mgr import CachedParamMgr, EvictionStrategy
from .cached_embedding import CachedEmbeddingBag
def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]: def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
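
get_partition splits the embedding dimension into one column block per rank. The sketch below is a plausible reading, not the actual implementation from this diff; in particular, interpreting the returned tuple as (start, end, evenly_divisible) is an assumption.

    def get_partition(embedding_dim, rank, world_size):
        # (start, end, evenly_divisible) is the assumed meaning of the returned tuple.
        # Give every rank a contiguous slice of columns; the last rank absorbs the remainder.
        if world_size == 1:
            return 0, embedding_dim, True
        block = embedding_dim // world_size
        start = rank * block
        end = embedding_dim if rank == world_size - 1 else start + block
        return start, end, embedding_dim % world_size == 0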


@ -1,15 +1,16 @@
import time
from typing import List
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import torch.nn.functional as F import torch.nn.functional as F
from .cached_embedding import CachedEmbeddingBag from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
from .cache_mgr import EvictionStrategy
from .embedding_config import TablewiseEmbeddingBagConfig
from colossalai.tensor import ProcessGroup from colossalai.tensor import ProcessGroup
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
from typing import List from .cache_mgr import EvictionStrategy
import time from .cached_embedding import CachedEmbeddingBag
from .embedding_config import TablewiseEmbeddingBagConfig
class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag): class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):


@ -1,17 +1,17 @@
import abc
from typing import List
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import torch.nn as nn import torch.nn as nn
from torch.profiler import record_function from torch.profiler import record_function
from .cached_embedding import CachedEmbeddingBag from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
from colossalai.tensor import ProcessGroup from colossalai.tensor import ProcessGroup
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
from .embedding_config import TablewiseEmbeddingBagConfig
from .cache_mgr import EvictionStrategy
from typing import List from .cache_mgr import EvictionStrategy
import abc from .cached_embedding import CachedEmbeddingBag
from .embedding_config import TablewiseEmbeddingBagConfig
class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module): class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):


@ -1,6 +1,7 @@
from colossalai.tensor.distspec import _DistSpec from typing import Dict, List
from colossalai.tensor import ComputePattern from colossalai.tensor import ComputePattern
from typing import List, Dict from colossalai.tensor.distspec import _DistSpec
class ColoModule(object): class ColoModule(object):


@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
from .colo_module import ColoModule from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
class ColoEmbedding(ColoModule): class ColoEmbedding(ColoModule):


@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
from .colo_module import ColoModule from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
class ColoLinear(ColoModule): class ColoLinear(ColoModule):


@ -1,9 +1,11 @@
from typing import Dict from typing import Dict
from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
from colossalai.tensor import distspec
from . import ColoModule
import torch import torch
from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
from . import ColoModule
_COLOSSAL_MODULES: Dict[type, ColoModule] = {} _COLOSSAL_MODULES: Dict[type, ColoModule] = {}


@ -7,9 +7,9 @@ from typing import Callable
import torch import torch
import torch.distributed as dist import torch.distributed as dist
from colossalai.communication import all_reduce
from colossalai.context import ParallelMode from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.registry import HOOKS from colossalai.legacy.registry import HOOKS
from colossalai.utils import get_current_device, is_no_pp_or_last_stage from colossalai.utils import get_current_device, is_no_pp_or_last_stage


@ -6,8 +6,7 @@ import logging
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import List, Union
import colossalai import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
class DistributedLogger: class DistributedLogger:
@ -63,6 +62,7 @@ class DistributedLogger:
self._logger.propagate = False self._logger.propagate = False
DistributedLogger.__instances[name] = self DistributedLogger.__instances[name] = self
self.rank = dist.get_rank() if dist.is_initialized() else 0
@staticmethod @staticmethod
def __get_call_info(): def __get_call_info():
@ -109,16 +109,10 @@ class DistributedLogger:
# create log directory # create log directory
path.mkdir(parents=True, exist_ok=True) path.mkdir(parents=True, exist_ok=True)
# set the default file name if path is a directory
if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
rank = 0
else:
rank = colossalai.core.global_context.get_global_rank()
if suffix is not None: if suffix is not None:
log_file_name = f'rank_{rank}_{suffix}.log' log_file_name = f'rank_{self.rank}_{suffix}.log'
else: else:
log_file_name = f'rank_{rank}.log' log_file_name = f'rank_{self.rank}.log'
path = path.joinpath(log_file_name) path = path.joinpath(log_file_name)
# add file handler # add file handler
@ -128,19 +122,14 @@ class DistributedLogger:
file_handler.setFormatter(formatter) file_handler.setFormatter(formatter)
self._logger.addHandler(file_handler) self._logger.addHandler(file_handler)
def _log(self, def _log(self, level, message: str, ranks: List[int] = None) -> None:
level,
message: str,
parallel_mode: ParallelMode = ParallelMode.GLOBAL,
ranks: List[int] = None) -> None:
if ranks is None: if ranks is None:
getattr(self._logger, level)(message) getattr(self._logger, level)(message)
else: else:
local_rank = colossalai.core.global_context.get_local_rank(parallel_mode) if self.rank in ranks:
if local_rank in ranks:
getattr(self._logger, level)(message) getattr(self._logger, level)(message)
def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: def info(self, message: str, ranks: List[int] = None) -> None:
"""Log an info message. """Log an info message.
Args: Args:
@ -150,10 +139,10 @@ class DistributedLogger:
ranks (List[int]): List of parallel ranks. ranks (List[int]): List of parallel ranks.
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('info', message_prefix, parallel_mode, ranks) self._log('info', message_prefix, ranks)
self._log('info', message, parallel_mode, ranks) self._log('info', message, ranks)
def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: def warning(self, message: str, ranks: List[int] = None) -> None:
"""Log a warning message. """Log a warning message.
Args: Args:
@ -163,10 +152,10 @@ class DistributedLogger:
ranks (List[int]): List of parallel ranks. ranks (List[int]): List of parallel ranks.
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('warning', message_prefix, parallel_mode, ranks) self._log('warning', message_prefix, ranks)
self._log('warning', message, parallel_mode, ranks) self._log('warning', message, ranks)
def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: def debug(self, message: str, ranks: List[int] = None) -> None:
"""Log a debug message. """Log a debug message.
Args: Args:
@ -176,10 +165,10 @@ class DistributedLogger:
ranks (List[int]): List of parallel ranks. ranks (List[int]): List of parallel ranks.
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('debug', message_prefix, parallel_mode, ranks) self._log('debug', message_prefix, ranks)
self._log('debug', message, parallel_mode, ranks) self._log('debug', message, ranks)
def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: def error(self, message: str, ranks: List[int] = None) -> None:
"""Log an error message. """Log an error message.
Args: Args:
@ -189,5 +178,5 @@ class DistributedLogger:
ranks (List[int]): List of parallel ranks. ranks (List[int]): List of parallel ranks.
""" """
message_prefix = "{}:{} {}".format(*self.__get_call_info()) message_prefix = "{}:{} {}".format(*self.__get_call_info())
self._log('error', message_prefix, parallel_mode, ranks) self._log('error', message_prefix, ranks)
self._log('error', message, parallel_mode, ranks) self._log('error', message, ranks)
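
After this refactor the logger takes its rank from torch.distributed once at construction, and every log method filters by a plain ranks list instead of a parallel_mode. A short sketch of the resulting call pattern, using get_dist_logger from colossalai.logging as the factory:

    from colossalai.logging import get_dist_logger

    logger = get_dist_logger()
    logger.info("seen on every rank")
    logger.info("seen on rank 0 only", ranks=[0])   # no parallel_mode argument anymore
    logger.log_to_file("./logs", suffix="train")    # writes rank_{rank}_train.log per process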


@ -1,6 +1,5 @@
from ._ops import * from .init import *
from .layer import * from .layer import *
from .loss import * from .loss import *
from .lr_scheduler import * from .lr_scheduler import *
from .metric import *
from .optimizer import * from .optimizer import *


@ -1,10 +1,2 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .moe import * from .moe import *
from .utils import * from .utils import *
from .vanilla import *
from .wrapper import *


@ -1,7 +0,0 @@
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
__all__ = [
'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
]


@ -0,0 +1,14 @@
def divide(numerator, denominator):
"""Only allow exact division.
Args:
numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
"""
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \
'{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator
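
divide() is the usual guard wherever a dimension must split evenly across ranks, for example:

    hidden_size, world_size = 1024, 8
    partition_size = divide(hidden_size, world_size)   # 128
    # divide(10, 3) raises an AssertionError instead of silently truncating.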


@ -1,7 +0,0 @@
from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)
__all__ = [
'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]


@ -1,41 +1 @@
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss from .loss_moe import MoeCrossEntropyLoss, MoeLoss
_parallel_cross_entropy = {
'2d': CrossEntropyLoss2D,
'2.5d': CrossEntropyLoss2p5D,
'3d': CrossEntropyLoss3D,
}
_vocab_parallel_cross_entropy = {
'1d': VocabParallelCrossEntropyLoss1D,
'2d': VocabParallelCrossEntropyLoss2D,
'2.5d': VocabParallelCrossEntropyLoss2p5D,
'3d': VocabParallelCrossEntropyLoss3D,
}
class CrossEntropyLoss(_Loss):
def __init__(self, reduction: bool = True, *args, **kwargs):
super().__init__()
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is not None and env.vocab_parallel:
self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
elif tensor_parallel is None or tensor_parallel == '1d':
reduction = 'mean' if reduction else 'none'
self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
else:
self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
def forward(self, *args):
return self.loss(*args)


@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR
from colossalai.legacy.registry import LR_SCHEDULERS
from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler
@LR_SCHEDULERS.register_module
class CosineAnnealingLR(_CosineAnnealingLR): class CosineAnnealingLR(_CosineAnnealingLR):
r"""Set the learning rate of each parameter group using a cosine annealing r"""Set the learning rate of each parameter group using a cosine annealing
schedule, where :math:`\eta_{max}` is set to the initial lr and schedule, where :math:`\eta_{max}` is set to the initial lr and
@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch) super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class CosineAnnealingWarmupLR(WarmupScheduler): class CosineAnnealingWarmupLR(WarmupScheduler):
"""Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
super().__init__(optimizer, warmup_steps, base_scheduler) super().__init__(optimizer, warmup_steps, base_scheduler)
@LR_SCHEDULERS.register_module
class FlatAnnealingLR(DelayerScheduler): class FlatAnnealingLR(DelayerScheduler):
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay. """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch) super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class FlatAnnealingWarmupLR(WarmupDelayerScheduler): class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
applied, and then the learning rate will be a fixed value before starting decay. applied, and then the learning rate will be a fixed value before starting decay.
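
With the @LR_SCHEDULERS.register_module decorators removed, these schedulers are plain classes and are constructed directly instead of being looked up through the legacy registry. A minimal sketch (hyper-parameter values are arbitrary):

    import torch
    from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=1_000, warmup_steps=100)

    for _ in range(1_000):
        optimizer.step()
        scheduler.step()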


@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import _LRScheduler from torch.optim.lr_scheduler import _LRScheduler
from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler): class LinearWarmupLR(_LRScheduler):
"""Linearly warmup learning rate and then linearly decay. """Linearly warmup learning rate and then linearly decay.


@ -2,12 +2,9 @@ from typing import List
from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR
from colossalai.legacy.registry import LR_SCHEDULERS
from .delayed import WarmupScheduler from .delayed import WarmupScheduler
@LR_SCHEDULERS.register_module
class MultiStepLR(_MultiStepLR): class MultiStepLR(_MultiStepLR):
"""Decays the learning rate of each parameter group by gamma once the """Decays the learning rate of each parameter group by gamma once the
number of epoch reaches one of the milestones. Notice that such decay can number of epoch reaches one of the milestones. Notice that such decay can
@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch) super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler): class MultiStepWarmupLR(WarmupScheduler):
"""Multistep learning rate scheduler with warmup. """Multistep learning rate scheduler with warmup.


@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
class OneCycleLR(_OneCycleLR): class OneCycleLR(_OneCycleLR):
r"""Sets the learning rate of each parameter group according to the r"""Sets the learning rate of each parameter group according to the
1cycle learning rate policy. The 1cycle policy anneals the learning 1cycle learning rate policy. The 1cycle policy anneals the learning


@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import _LRScheduler from torch.optim.lr_scheduler import _LRScheduler
from colossalai.legacy.registry import LR_SCHEDULERS
from .delayed import WarmupScheduler from .delayed import WarmupScheduler
@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler): class PolynomialLR(_LRScheduler):
"""Polynomial learning rate scheduler. """Polynomial learning rate scheduler.
@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
for base_lr in self.base_lrs] for base_lr in self.base_lrs]
@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler): class PolynomialWarmupLR(WarmupScheduler):
"""Polynomial learning rate scheduler with warmup. """Polynomial learning rate scheduler with warmup.


@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR from torch.optim.lr_scheduler import StepLR as _StepLR
from colossalai.legacy.registry import LR_SCHEDULERS
@LR_SCHEDULERS.register_module
class LambdaLR(_LambdaLR): class LambdaLR(_LambdaLR):
"""Sets the learning rate of each parameter group to the initial lr """Sets the learning rate of each parameter group to the initial lr
times a given function. When last_epoch=-1, sets initial lr as lr. times a given function. When last_epoch=-1, sets initial lr as lr.
@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR): class MultiplicativeLR(_MultiplicativeLR):
"""Multiply the learning rate of each parameter group by the factor given """Multiply the learning rate of each parameter group by the factor given
in the specified function. When last_epoch=-1, sets initial lr as lr. in the specified function. When last_epoch=-1, sets initial lr as lr.
@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class StepLR(_StepLR): class StepLR(_StepLR):
"""Decays the learning rate of each parameter group by gamma every """Decays the learning rate of each parameter group by gamma every
step_size epochs. Notice that such decay can happen simultaneously with step_size epochs. Notice that such decay can happen simultaneously with
@ -61,7 +56,6 @@ class StepLR(_StepLR):
super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch) super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class ExponentialLR(_ExponentialLR): class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch. """Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr When last_epoch=-1, sets initial lr as lr
