mirror of https://github.com/hpcaitech/ColossalAI
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent to gpc
* [legacy] make optim independent to registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
parent 536397cc95
commit 554aa9592e
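Note: the diff below is largely a mechanical import-path migration — `colossalai.communication` moves to `colossalai.legacy.communication`, and the tensor-parallel layers, losses and metrics under `colossalai.nn` move to `colossalai.legacy.nn`. A minimal sketch of how downstream code can adapt, assuming the legacy modules keep the same public names (the try/except fallback is illustrative and not part of this PR):

```python
# Hedged sketch: prefer the new legacy namespace introduced by this PR and fall
# back to the old location so the same script also runs on older releases.
try:
    import colossalai.legacy.nn as col_nn                    # new location (this PR)
    from colossalai.legacy.communication import all_reduce   # moved collective helper
except ImportError:
    import colossalai.nn as col_nn                           # pre-#4671 location
    from colossalai.communication import all_reduce
```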
@@ -4,7 +4,7 @@ from typing import Optional, Set
import torch
import torch.nn as nn

from colossalai.nn.parallel.data_parallel import _cast_float
from colossalai.utils import _cast_float
from colossalai.zero.legacy.gemini.tensor_utils import free_storage

from .region_manager import RegionManager

@@ -1,5 +1,4 @@
class Registry:
    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

    def __init__(self, name):
        self.name = name

@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
import torch
import torch.nn as nn
from torch.optim import Optimizer
from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.nn.optimizer import ColossalaiOptimizer

@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
        checkpoint_path (str): Path to the checkpoint directory.
        is_master (bool): Whether current rank is main process.
    """
    try:
        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
    except ImportError:
        return
    if not isinstance(model, PreTrainedModel):
        return

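The hunk above moves the `transformers` import inside `save_config_file` and guards it with try/except, so checkpoint saving no longer hard-depends on `transformers`. A simplified sketch of the resulting pattern (the helper name and the final `save_pretrained` call are assumptions; the rest of the function body is not shown in this hunk):

```python
def save_hf_config_if_possible(model, checkpoint_path: str, is_master: bool = True) -> None:
    # Import lazily: if transformers is not installed, skip saving the HF config.
    try:
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return
    # Only Hugging Face models carry a config worth writing next to the checkpoint.
    if not isinstance(model, PreTrainedModel):
        return
    if is_master:
        model.config.save_pretrained(checkpoint_path)  # hypothetical: the actual saving code is elided above
```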
@@ -1,6 +1,6 @@
import torch

import colossalai.nn as col_nn
import colossalai.legacy.nn as col_nn


class MLP(torch.nn.Module):

@@ -1,6 +1,6 @@
import torch

from colossalai.nn.layer.colossalai_layer import Embedding, Linear
from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
from colossalai.utils import get_current_device

from .bias_dropout_add import bias_dropout_add_fused_train

@@ -1,9 +1,17 @@
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
                  recv_forward, recv_backward)
from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
from .p2p import (
    recv_backward,
    recv_forward,
    send_backward,
    send_backward_recv_backward,
    send_backward_recv_forward,
    send_forward,
    send_forward_backward_recv_forward_backward,
    send_forward_recv_backward,
    send_forward_recv_forward,
)
from .ring import ring_forward
from .utils import send_obj_meta, recv_obj_meta
from .utils import recv_obj_meta, send_obj_meta

__all__ = [
    'all_gather',

@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union

import torch.cuda

import colossalai.communication as comm
import colossalai.legacy.communication as comm
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc

@@ -5,10 +5,10 @@ from typing import Iterable, Tuple

import torch.cuda

import colossalai.communication.p2p_v2 as comm
from colossalai import engine
import colossalai.legacy.communication.p2p_v2 as comm
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.engine import Engine
from colossalai.utils.cuda import get_current_device

from ._pipeline_schedule import PipelineSchedule

@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
    """

    def forward_backward_step(self,
                              engine: engine.Engine,
                              engine: Engine,
                              data_iter: Iterable,
                              forward_only=False,
                              return_loss=True,

@@ -0,0 +1,4 @@
from ._ops import *
from .layer import *
from .loss import *
from .metric import *

@@ -4,7 +4,7 @@ import torch
import torch.distributed as dist

from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import divide
from colossalai.legacy.nn.layer.utils import divide
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup

GeneralTensor = Union[ColoTensor, torch.Tensor]

@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
    return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)


### table wise embedding shard
# table wise embedding shard


def _all_to_all_for_tablewise(x: torch.Tensor,

@@ -1,8 +1,10 @@
import torch.nn.functional as F
from typing import Optional

import torch.nn.functional as F

from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
    ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input

@@ -1,9 +1,11 @@
import torch.nn.functional as F
from typing import Optional

import torch.nn.functional as F
from torch import Tensor

from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
    ShardSpec, ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,7 +1,10 @@
from typing import List, Optional

import torch.nn.functional as F

from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,9 +1,12 @@
from typing import Optional

import torch
import torch.nn.functional as F
from typing import Optional
from colossalai.tensor.op_wrapper import colo_op_impl

from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor.op_wrapper import colo_op_impl

from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -0,0 +1,9 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *

@@ -1,7 +1,7 @@
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']

@ -1,151 +1,152 @@
|
|||
import math
|
||||
from typing import Callable
|
||||
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import dtype, nn
|
||||
|
||||
from ... import init as init
|
||||
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
|
||||
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
|
||||
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
|
||||
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaPatchEmbedding
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_embedding = {
|
||||
'1d': Embedding1D,
|
||||
'2d': Embedding2D,
|
||||
'2.5d': Embedding2p5D,
|
||||
'3d': Embedding3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_embedding = {
|
||||
'1d': VocabParallelEmbedding1D,
|
||||
'2d': VocabParallelEmbedding2D,
|
||||
'2.5d': VocabParallelEmbedding2p5D,
|
||||
'3d': VocabParallelEmbedding3D
|
||||
}
|
||||
|
||||
_parallel_patchembedding = {
|
||||
None: VanillaPatchEmbedding,
|
||||
'1d': PatchEmbedding1D,
|
||||
'2d': PatchEmbedding2D,
|
||||
'2.5d': PatchEmbedding2p5D,
|
||||
'3d': PatchEmbedding3D
|
||||
}
|
||||
|
||||
|
||||
class Embedding(ColossalaiModule):
|
||||
r"""Embedding for colossalai.
|
||||
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
he initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
weight_initializer: Callable = init.normal_(),
|
||||
vocab_parallel_limit: int = 2048,
|
||||
*args,
|
||||
**kwargs) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
|
||||
**kwargs).to(dtype).to(get_current_device())
|
||||
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
|
||||
elif num_embeddings <= vocab_parallel_limit:
|
||||
embed = _parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
embed = _vocab_parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
super().__init__(embed)
|
||||
|
||||
|
||||
class PatchEmbedding(ColossalaiModule):
|
||||
"""2D Image to Patch Embedding.
|
||||
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embed_size: int,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer: Callable = init.zeros_()
|
||||
) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
embed = _parallel_patchembedding[tensor_parallel](
|
||||
img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embed_size,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
weight_initializer=weight_initializer,
|
||||
bias_initializer=bias_initializer,
|
||||
position_embed_initializer=position_embed_initializer,
|
||||
)
|
||||
super().__init__(embed)
|
||||
import math
|
||||
from typing import Callable
|
||||
|
||||
from torch import dtype, nn
|
||||
|
||||
from colossalai.nn import init
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
|
||||
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
|
||||
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
|
||||
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaPatchEmbedding
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_embedding = {
|
||||
'1d': Embedding1D,
|
||||
'2d': Embedding2D,
|
||||
'2.5d': Embedding2p5D,
|
||||
'3d': Embedding3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_embedding = {
|
||||
'1d': VocabParallelEmbedding1D,
|
||||
'2d': VocabParallelEmbedding2D,
|
||||
'2.5d': VocabParallelEmbedding2p5D,
|
||||
'3d': VocabParallelEmbedding3D
|
||||
}
|
||||
|
||||
_parallel_patchembedding = {
|
||||
None: VanillaPatchEmbedding,
|
||||
'1d': PatchEmbedding1D,
|
||||
'2d': PatchEmbedding2D,
|
||||
'2.5d': PatchEmbedding2p5D,
|
||||
'3d': PatchEmbedding3D
|
||||
}
|
||||
|
||||
|
||||
class Embedding(ColossalaiModule):
|
||||
r"""Embedding for colossalai.
|
||||
|
||||
Args:
|
||||
num_embeddings (int): number of embeddings.
|
||||
embedding_dim (int): dimension of embedding.
|
||||
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
|
||||
therefore, the embedding vector at padding_idx is not updated during training,
|
||||
i.e. it remains as a fixed “pad”, defaults to None.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
he initializer of weight, defaults to normal initializer.
|
||||
|
||||
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
|
||||
::
|
||||
|
||||
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
|
||||
renormalized to have norm max_norm. Note: this will modify weight in-place.
|
||||
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
|
||||
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
|
||||
of frequency of the words in the mini-batch. Default False.
|
||||
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
|
||||
|
||||
More details about ``args`` and ``kwargs`` could be found in
|
||||
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
padding_idx: int = None,
|
||||
dtype: dtype = None,
|
||||
weight_initializer: Callable = init.normal_(),
|
||||
vocab_parallel_limit: int = 2048,
|
||||
*args,
|
||||
**kwargs) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
|
||||
**kwargs).to(dtype).to(get_current_device())
|
||||
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
|
||||
elif num_embeddings <= vocab_parallel_limit:
|
||||
embed = _parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
embed = _vocab_parallel_embedding[tensor_parallel](
|
||||
num_embeddings,
|
||||
embedding_dim,
|
||||
padding_idx=padding_idx,
|
||||
dtype=dtype,
|
||||
weight_initializer=weight_initializer,
|
||||
*args,
|
||||
**kwargs,
|
||||
)
|
||||
super().__init__(embed)
|
||||
|
||||
|
||||
class PatchEmbedding(ColossalaiModule):
|
||||
"""2D Image to Patch Embedding.
|
||||
|
||||
Args:
|
||||
img_size (int): image size.
|
||||
patch_size (int): patch size.
|
||||
in_chans (int): number of channels of input image.
|
||||
embed_size (int): size of embedding.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
flatten (bool, optional): whether to flatten output tensor, defaults to True.
|
||||
weight_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of weight, defaults to kaiming uniform initializer.
|
||||
bias_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of bias, defaults to xavier uniform initializer.
|
||||
position_embed_initializer (:class:`typing.Callable`, optional):
|
||||
The initializer of position embedding, defaults to zeros initializer.
|
||||
|
||||
More details about ``initializer`` please refer to
|
||||
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
img_size: int,
|
||||
patch_size: int,
|
||||
in_chans: int,
|
||||
embed_size: int,
|
||||
dtype: dtype = None,
|
||||
flatten: bool = True,
|
||||
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
|
||||
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
|
||||
position_embed_initializer: Callable = init.zeros_()
|
||||
) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
embed = _parallel_patchembedding[tensor_parallel](
|
||||
img_size,
|
||||
patch_size,
|
||||
in_chans,
|
||||
embed_size,
|
||||
dtype=dtype,
|
||||
flatten=flatten,
|
||||
weight_initializer=weight_initializer,
|
||||
bias_initializer=bias_initializer,
|
||||
position_embed_initializer=position_embed_initializer,
|
||||
)
|
||||
super().__init__(embed)
|
|
@ -4,9 +4,9 @@ from typing import Callable
|
|||
|
||||
from torch import dtype, nn
|
||||
|
||||
from colossalai.nn import init
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ... import init as init
|
||||
from ..parallel_1d import *
|
||||
from ..parallel_2d import *
|
||||
from ..parallel_2p5d import *
|
|
@ -1,41 +1,42 @@
|
|||
from colossalai.utils import get_current_device
|
||||
from torch import nn
|
||||
|
||||
from ..parallel_1d import LayerNorm1D
|
||||
from ..parallel_2d import LayerNorm2D
|
||||
from ..parallel_2p5d import LayerNorm2p5D
|
||||
from ..parallel_3d import LayerNorm3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaLayerNorm
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_layernorm = {
|
||||
None: VanillaLayerNorm,
|
||||
"1d": LayerNorm1D,
|
||||
"2d": LayerNorm2D,
|
||||
"2.5d": LayerNorm2p5D,
|
||||
"3d": LayerNorm3D,
|
||||
}
|
||||
|
||||
|
||||
class LayerNorm(ColossalaiModule):
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
bias (bool, optional): Whether to add a bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
|
||||
else:
|
||||
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
|
||||
super().__init__(norm)
|
||||
from torch import nn
|
||||
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
from ..parallel_1d import LayerNorm1D
|
||||
from ..parallel_2d import LayerNorm2D
|
||||
from ..parallel_2p5d import LayerNorm2p5D
|
||||
from ..parallel_3d import LayerNorm3D
|
||||
from ..utils import get_tensor_parallel_mode
|
||||
from ..vanilla import VanillaLayerNorm
|
||||
from ._utils import ColossalaiModule
|
||||
|
||||
_parallel_layernorm = {
|
||||
None: VanillaLayerNorm,
|
||||
"1d": LayerNorm1D,
|
||||
"2d": LayerNorm2D,
|
||||
"2.5d": LayerNorm2p5D,
|
||||
"3d": LayerNorm3D,
|
||||
}
|
||||
|
||||
|
||||
class LayerNorm(ColossalaiModule):
|
||||
r"""Layer Normalization for colossalai.
|
||||
|
||||
Args:
|
||||
normalized_shape (int): input shape from an expected input of size.
|
||||
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
|
||||
\times \ldots \times \text{normalized_shape}[-1]]`
|
||||
If a single integer is used, it is treated as a singleton list, and this module will
|
||||
normalize over the last dimension which is expected to be of that specific size.
|
||||
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
|
||||
bias (bool, optional): Whether to add a bias, defaults to ``True``.
|
||||
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
|
||||
"""
|
||||
|
||||
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is None:
|
||||
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
|
||||
else:
|
||||
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
|
||||
super().__init__(norm)
|
|
@ -0,0 +1,17 @@
|
|||
from .layers import (
|
||||
Classifier1D,
|
||||
Dropout1D,
|
||||
Embedding1D,
|
||||
LayerNorm1D,
|
||||
Linear1D,
|
||||
Linear1D_Col,
|
||||
Linear1D_Row,
|
||||
PatchEmbedding1D,
|
||||
VocabParallelClassifier1D,
|
||||
VocabParallelEmbedding1D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
|
||||
'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
|
||||
]
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
|
||||
|
@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
|
|||
class _SplitForwardGatherBackward(torch.autograd.Function):
|
||||
"""
|
||||
Split the input and keep only the corresponding chuck to the rank.
|
||||
|
||||
|
||||
Args:
|
||||
input_: input matrix.
|
||||
parallel_mode: parallel mode.
|
|
@ -10,11 +10,11 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.kernel import LayerNorm
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import (
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_2d, split_batch_2d
|
||||
from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
|
||||
VocabParallelEmbedding2D)
|
||||
from .layers import (
|
||||
Classifier2D,
|
||||
Embedding2D,
|
||||
LayerNorm2D,
|
||||
Linear2D,
|
||||
PatchEmbedding2D,
|
||||
VocabParallelClassifier2D,
|
||||
VocabParallelEmbedding2D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
|
|
@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
||||
def matmul_2d(
|
||||
|
@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opa = [None] * 2
|
||||
opb = [None] * 2
|
||||
|
@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opb = [None] * 2
|
||||
opr = [None] * 2
|
||||
|
@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
|
|||
col_group = gpc.get_group(col_parallel_mode)
|
||||
|
||||
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
pipeline_parallel_rank * tensor_parallel_size
|
||||
|
||||
opa = [None] * 2
|
||||
opr = [None] * 2
|
|
@ -8,10 +8,10 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
|
||||
VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
|
||||
from .layers import (
|
||||
Classifier2p5D,
|
||||
Embedding2p5D,
|
||||
LayerNorm2p5D,
|
||||
Linear2p5D,
|
||||
PatchEmbedding2p5D,
|
||||
VocabParallelClassifier2p5D,
|
||||
VocabParallelEmbedding2p5D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
|
|
@ -2,13 +2,14 @@ from typing import Any, Tuple
|
|||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.utils import get_current_device
|
||||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
||||
def get_parallel_group(parallel_mode: ParallelMode):
|
||||
return gpc.get_group(parallel_mode)
|
|
@ -8,10 +8,10 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import broadcast
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import broadcast
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.utils.checkpointing import (
|
|
@ -1,6 +1,13 @@
|
|||
from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
|
||||
from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
|
||||
VocabParallelEmbedding3D)
|
||||
from .layers import (
|
||||
Classifier3D,
|
||||
Embedding3D,
|
||||
LayerNorm3D,
|
||||
Linear3D,
|
||||
PatchEmbedding3D,
|
||||
VocabParallelClassifier3D,
|
||||
VocabParallelEmbedding3D,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
|
|
@ -7,10 +7,10 @@ import torch
|
|||
from torch import Tensor
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
|
||||
|
||||
from ._utils import get_parallel_mode_from_env, push_async_grad
|
||||
|
|
@ -8,14 +8,14 @@ import torch.nn.functional as F
|
|||
from torch import Tensor
|
||||
from torch.nn import Parameter
|
||||
|
||||
from colossalai.communication import all_reduce, broadcast
|
||||
from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.context import ParallelMode, seed
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.communication import all_reduce, broadcast
|
||||
from colossalai.legacy.nn.layer.base_layer import ParallelLayer
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn import init as init
|
||||
from colossalai.nn.layer.base_layer import ParallelLayer
|
||||
from colossalai.utils.checkpointing import (
|
||||
broadcast_state_dict,
|
||||
gather_tensor_parallel_state_dict,
|
|
@ -1,4 +1,4 @@
|
|||
from ._operation import RingQK, RingAV
|
||||
from ._operation import RingAV, RingQK
|
||||
from .layers import TransformerSelfAttentionRing
|
||||
|
||||
__all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
|
|
@ -3,13 +3,13 @@
|
|||
|
||||
import torch
|
||||
from torch import distributed as dist
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
from colossalai.communication import ring_forward
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
|
||||
from colossalai.legacy.communication import ring_forward
|
||||
from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
|
||||
from colossalai.utils import get_current_device
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
|
||||
|
||||
class RingQK(torch.autograd.Function):
|
|
@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
|
|||
from colossalai.core import global_context as gpc
|
||||
from colossalai.kernel import FusedScaleMaskSoftmax
|
||||
from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
|
||||
from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
|
||||
from colossalai.legacy.registry import LAYERS
|
||||
from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK
|
||||
|
||||
|
||||
@LAYERS.register_module
|
|
@ -0,0 +1,15 @@
|
|||
from .common import (
|
||||
ACT2FN,
|
||||
CheckpointModule,
|
||||
_ntuple,
|
||||
divide,
|
||||
get_tensor_parallel_mode,
|
||||
set_tensor_parallel_attribute_by_partition,
|
||||
set_tensor_parallel_attribute_by_size,
|
||||
to_2tuple,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
|
||||
'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
|
||||
]
|
|
@ -6,10 +6,11 @@ from itertools import repeat
|
|||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import Tensor, nn
|
||||
|
||||
from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.utils import checkpoint
|
||||
from torch import Tensor, nn
|
||||
|
||||
|
||||
class CheckpointModule(nn.Module):
|
|
@ -1,6 +1,8 @@
|
|||
import torch.nn as nn
|
||||
import torch.distributed as dist
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
from torch import nn
|
||||
from torch.nn.modules.loss import *
|
||||
from torch.nn.modules.loss import _Loss
|
||||
|
||||
from colossalai.global_variables import tensor_parallel_env as env
|
||||
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
from .loss_1d import VocabParallelCrossEntropyLoss1D
|
||||
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
|
||||
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
|
||||
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
|
||||
|
||||
_parallel_cross_entropy = {
|
||||
'2d': CrossEntropyLoss2D,
|
||||
'2.5d': CrossEntropyLoss2p5D,
|
||||
'3d': CrossEntropyLoss3D,
|
||||
}
|
||||
|
||||
_vocab_parallel_cross_entropy = {
|
||||
'1d': VocabParallelCrossEntropyLoss1D,
|
||||
'2d': VocabParallelCrossEntropyLoss2D,
|
||||
'2.5d': VocabParallelCrossEntropyLoss2p5D,
|
||||
'3d': VocabParallelCrossEntropyLoss3D,
|
||||
}
|
||||
|
||||
|
||||
class CrossEntropyLoss(_Loss):
|
||||
|
||||
def __init__(self, reduction: bool = True, *args, **kwargs):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel is not None and env.vocab_parallel:
|
||||
self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
|
||||
elif tensor_parallel is None or tensor_parallel == '1d':
|
||||
reduction = 'mean' if reduction else 'none'
|
||||
self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
|
||||
else:
|
||||
self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
|
||||
|
||||
def forward(self, *args):
|
||||
return self.loss(*args)
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss
|
|||
|
||||
from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from colossalai.legacy.registry import LOSSES
|
||||
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
|
@ -1,26 +1,28 @@
|
|||
from torch import nn
|
||||
|
||||
from ._utils import calc_acc
|
||||
from .accuracy_2d import Accuracy2D
|
||||
from .accuracy_2p5d import Accuracy2p5D
|
||||
from .accuracy_3d import Accuracy3D
|
||||
from colossalai.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
_parallel_accuracy = {
|
||||
'2d': Accuracy2D,
|
||||
'2.5d': Accuracy2p5D,
|
||||
'3d': Accuracy3D,
|
||||
}
|
||||
|
||||
|
||||
class Accuracy(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel not in _parallel_accuracy:
|
||||
self.acc = calc_acc
|
||||
else:
|
||||
self.acc = _parallel_accuracy[tensor_parallel]()
|
||||
|
||||
def forward(self, *args):
|
||||
return self.acc(*args)
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
|
||||
|
||||
from ._utils import calc_acc
|
||||
from .accuracy_2d import Accuracy2D
|
||||
from .accuracy_2p5d import Accuracy2p5D
|
||||
from .accuracy_3d import Accuracy3D
|
||||
|
||||
_parallel_accuracy = {
|
||||
'2d': Accuracy2D,
|
||||
'2.5d': Accuracy2p5D,
|
||||
'3d': Accuracy3D,
|
||||
}
|
||||
|
||||
|
||||
class Accuracy(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
tensor_parallel = get_tensor_parallel_mode()
|
||||
if tensor_parallel not in _parallel_accuracy:
|
||||
self.acc = calc_acc
|
||||
else:
|
||||
self.acc = _parallel_accuracy[tensor_parallel]()
|
||||
|
||||
def forward(self, *args):
|
||||
return self.acc(*args)
|
|
@ -1,7 +1,7 @@
|
|||
import torch
|
||||
|
||||
|
||||
def calc_acc(logits, targets):
|
||||
preds = torch.argmax(logits, dim=-1)
|
||||
correct = torch.sum(targets == preds)
|
||||
return correct
|
||||
import torch
|
||||
|
||||
|
||||
def calc_acc(logits, targets):
|
||||
preds = torch.argmax(logits, dim=-1)
|
||||
correct = torch.sum(targets == preds)
|
||||
return correct
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
|
@ -1,7 +1,8 @@
|
|||
import torch
|
||||
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
from torch import nn
|
||||
|
||||
from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
|
@ -1,33 +1,35 @@
|
|||
import torch
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
from torch import nn
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
||||
class Accuracy3D(nn.Module):
|
||||
"""Accuracy for 3D parallelism
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
|
||||
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
correct = calc_acc(logits, targets)
|
||||
correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
|
||||
return correct
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
|
||||
from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
|
||||
from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
|
||||
|
||||
from ._utils import calc_acc
|
||||
|
||||
|
||||
class Accuracy3D(nn.Module):
|
||||
"""Accuracy for 3D parallelism
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
|
||||
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
|
||||
|
||||
def forward(self, logits, targets):
|
||||
"""Calculate the accuracy of predicted labels.
|
||||
|
||||
Args:
|
||||
logits (:class:`torch.tensor`): Predicted labels.
|
||||
targets (:class:`torch.tensor`): True labels from data.
|
||||
|
||||
Returns:
|
||||
float: the accuracy of prediction.
|
||||
"""
|
||||
with torch.no_grad():
|
||||
targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
|
||||
targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
|
||||
correct = calc_acc(logits, targets)
|
||||
correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
|
||||
return correct
|
|
@ -1,10 +1,17 @@
|
|||
from .cache_embedding import (
|
||||
CachedEmbeddingBag,
|
||||
CachedParamMgr,
|
||||
EvictionStrategy,
|
||||
LimitBuffIndexCopyer,
|
||||
ParallelCachedEmbeddingBag,
|
||||
ParallelCachedEmbeddingBagTablewise,
|
||||
ParallelCachedEmbeddingBagTablewiseSpiltCache,
|
||||
TablewiseEmbeddingBagConfig,
|
||||
)
|
||||
from .colo_module import ColoModule
|
||||
from .linear import ColoLinear
|
||||
from .embedding import ColoEmbedding
|
||||
from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
|
||||
|
||||
from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
|
||||
ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
|
||||
from .linear import ColoLinear
|
||||
from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module
|
||||
|
||||
__all__ = [
|
||||
'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
|
|
@ -1,8 +1,8 @@
|
|||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from .parallel_cached_embedding import ParallelCachedEmbeddingBag
|
||||
from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
|
||||
from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
import abc
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
|
@ -1,12 +1,14 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
from torch.profiler import record_function
|
||||
from typing import List, Optional
|
||||
from contexttimer import Timer
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
from enum import Enum
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from enum import Enum
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from contexttimer import Timer
|
||||
from torch.profiler import record_function
|
||||
|
||||
from .copyer import LimitBuffIndexCopyer
|
||||
|
||||
|
||||
class EvictionStrategy(Enum):
|
||||
|
@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
|
|||
class CachedParamMgr(torch.nn.Module):
|
||||
"""
|
||||
Manage Embedding Weights on CPU and CUDA memory uses a software cache.
|
||||
CPU maintains the entire original weight.
|
||||
CPU maintains the entire original weight.
|
||||
CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
|
||||
During training, GPU needs to transmit embedding rows between CPU and GPU.
|
||||
Args:
|
||||
|
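The docstring above describes the idea behind `CachedParamMgr`: the full embedding table stays in CPU memory while a working set of at most `cuda_row_num` rows is mirrored on the GPU and rows are swapped in and out as batches need them. A toy sketch of that caching scheme (not the ColossalAI API — the real manager evicts by access frequency and copies rows in batches):

```python
import torch


class ToyRowCache:
    """Keep the full embedding table on CPU and a bounded set of hot rows on the GPU."""

    def __init__(self, cpu_weight: torch.Tensor, cuda_row_num: int):
        self.cpu_weight = cpu_weight    # full table on CPU
        self.gpu_rows = {}              # row id -> row tensor resident on the GPU
        self.capacity = cuda_row_num

    def fetch(self, row: int) -> torch.Tensor:
        if row not in self.gpu_rows:
            if len(self.gpu_rows) >= self.capacity:
                # Evict an arbitrary resident row; the real manager picks victims by frequency.
                victim, tensor = self.gpu_rows.popitem()
                self.cpu_weight[victim].copy_(tensor.cpu())    # write the evicted row back
            self.gpu_rows[row] = self.cpu_weight[row].cuda()
        return self.gpu_rows[row]
```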
@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
self._elapsed_dict[name] += t.elapsed
|
||||
|
||||
def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
|
||||
"""_find_evict_gpu_idxs
|
||||
"""_find_evict_gpu_idxs
|
||||
Find the gpu idxs to be evicted, according to their freq.
|
||||
Args:
|
||||
evict_num (int): how many rows has to be evicted
|
||||
|
@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
"""reorder
|
||||
reorder the weight according to ids' frequency in dataset before training.
|
||||
Execute only once before training, also known as warmup phase.
|
||||
|
||||
|
||||
Note:
|
||||
If you would like to use the DATASET as the eviction strategy, you must call this function.
|
||||
Note:
|
||||
|
@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
|
|||
"""
|
||||
deprecated
|
||||
evict one row from cuda to cpu.
|
||||
Returns:
|
||||
Returns:
|
||||
(int) : the slot id be evicted.
|
||||
"""
|
||||
mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
|
|
@ -1,10 +1,11 @@
|
|||
from typing import Iterator, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import List, Optional, Iterator, Tuple, Union
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .base_embedding import BaseEmbeddingBag
|
||||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
|
||||
class CachedEmbeddingBag(BaseEmbeddingBag):
|
||||
|
@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
|
|||
include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
|
||||
dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
|
||||
device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
|
||||
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
|
||||
cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
|
||||
ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
|
||||
warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
|
||||
buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
|
||||
|
@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
|
|||
buffer_size=50_000,
|
||||
pin_weight=False):
|
||||
"""
|
||||
Called after initialized.
|
||||
Called after initialized.
|
||||
Reorder the weight rows according to the ids_freq_mapping.
|
||||
Then, let the weights of the Module be managed by a CachedParamMgr.
|
||||
|
||||
|
||||
Args:
|
||||
cuda_row_num (int): number of rows can be hosted in CUDA memory
|
||||
ids_freq_mapping (List[int]): a list, idx is id number, value is freq
|
|
@ -3,7 +3,7 @@ from torch import LongTensor
|
|||
|
||||
|
||||
class LimitBuffIndexCopyer(object):
|
||||
"""LimitBuffIndexCopyer
|
||||
"""LimitBuffIndexCopyer
|
||||
Index Copy using limited temp buffer on CUDA.
|
||||
|
||||
Args:
|
||||
|
@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
|
|||
|
||||
@torch.no_grad()
|
||||
def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
|
||||
"""copy
|
||||
"""copy
|
||||
src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
|
||||
The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
|
||||
|
|
@ -1,12 +1,13 @@
|
|||
from typing import Iterator, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from typing import List, Optional, Iterator, Tuple
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from colossalai.nn._ops._utils import dual_all_to_all
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all
|
||||
from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
|
||||
|
||||
from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
|
||||
from .cache_mgr import CachedParamMgr, EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
|
||||
|
||||
def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
|
|
@ -1,15 +1,16 @@
|
|||
import time
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
|
||||
|
||||
from typing import List
|
||||
import time
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
|
||||
|
||||
class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
|
|
@ -1,17 +1,17 @@
|
|||
import abc
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from torch.profiler import record_function
|
||||
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
|
||||
from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from colossalai.tensor import ProcessGroup
|
||||
from colossalai.nn._ops._utils import dual_all_to_all_tablewise
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
from .cache_mgr import EvictionStrategy
|
||||
|
||||
from typing import List
|
||||
import abc
|
||||
from .cache_mgr import EvictionStrategy
|
||||
from .cached_embedding import CachedEmbeddingBag
|
||||
from .embedding_config import TablewiseEmbeddingBagConfig
|
||||
|
||||
|
||||
class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
|
|
@ -1,6 +1,7 @@
from colossalai.tensor.distspec import _DistSpec
from typing import Dict, List

from colossalai.tensor import ComputePattern
from typing import List, Dict
from colossalai.tensor.distspec import _DistSpec


class ColoModule(object):
@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec

from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec


class ColoEmbedding(ColoModule):
@ -1,5 +1,6 @@
from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec

from .colo_module import ColoModule
from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec


class ColoLinear(ColoModule):
@ -1,9 +1,11 @@
from typing import Dict
from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
from colossalai.tensor import distspec
from . import ColoModule

import torch

from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec

from . import ColoModule

_COLOSSAL_MODULES: Dict[type, ColoModule] = {}
@ -7,9 +7,9 @@ from typing import Callable
import torch
import torch.distributed as dist

from colossalai.communication import all_reduce
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.legacy.communication import all_reduce
from colossalai.legacy.registry import HOOKS
from colossalai.utils import get_current_device, is_no_pp_or_last_stage
|
@ -6,8 +6,7 @@ import logging
|
|||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
import colossalai
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
import torch.distributed as dist
|
||||
|
||||
|
||||
class DistributedLogger:
|
||||
|
@ -63,6 +62,7 @@ class DistributedLogger:
        self._logger.propagate = False

        DistributedLogger.__instances[name] = self
        self.rank = dist.get_rank() if dist.is_initialized() else 0

    @staticmethod
    def __get_call_info():
@ -109,16 +109,10 @@ class DistributedLogger:
        # create log directory
        path.mkdir(parents=True, exist_ok=True)

        # set the default file name if path is a directory
        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
            rank = 0
        else:
            rank = colossalai.core.global_context.get_global_rank()

        if suffix is not None:
            log_file_name = f'rank_{rank}_{suffix}.log'
            log_file_name = f'rank_{self.rank}_{suffix}.log'
        else:
            log_file_name = f'rank_{rank}.log'
            log_file_name = f'rank_{self.rank}.log'
        path = path.joinpath(log_file_name)

        # add file handler
@ -128,19 +122,14 @@ class DistributedLogger:
        file_handler.setFormatter(formatter)
        self._logger.addHandler(file_handler)

    def _log(self,
             level,
             message: str,
             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
             ranks: List[int] = None) -> None:
    def _log(self, level, message: str, ranks: List[int] = None) -> None:
        if ranks is None:
            getattr(self._logger, level)(message)
        else:
            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
            if local_rank in ranks:
            if self.rank in ranks:
                getattr(self._logger, level)(message)

    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def info(self, message: str, ranks: List[int] = None) -> None:
        """Log an info message.

        Args:
@ -150,10 +139,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('info', message_prefix, parallel_mode, ranks)
        self._log('info', message, parallel_mode, ranks)
        self._log('info', message_prefix, ranks)
        self._log('info', message, ranks)

    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def warning(self, message: str, ranks: List[int] = None) -> None:
        """Log a warning message.

        Args:
@ -163,10 +152,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('warning', message_prefix, parallel_mode, ranks)
        self._log('warning', message, parallel_mode, ranks)
        self._log('warning', message_prefix, ranks)
        self._log('warning', message, ranks)

    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def debug(self, message: str, ranks: List[int] = None) -> None:
        """Log a debug message.

        Args:
@ -176,10 +165,10 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('debug', message_prefix, parallel_mode, ranks)
        self._log('debug', message, parallel_mode, ranks)
        self._log('debug', message_prefix, ranks)
        self._log('debug', message, ranks)

    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
    def error(self, message: str, ranks: List[int] = None) -> None:
        """Log an error message.

        Args:
@ -189,5 +178,5 @@ class DistributedLogger:
            ranks (List[int]): List of parallel ranks.
        """
        message_prefix = "{}:{} {}".format(*self.__get_call_info())
        self._log('error', message_prefix, parallel_mode, ranks)
        self._log('error', message, parallel_mode, ranks)
        self._log('error', message_prefix, ranks)
        self._log('error', message, ranks)
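After this refactor the logger filters messages by the global torch.distributed rank instead of a ParallelMode-specific rank. A minimal usage sketch; the accessor name, logger name, and message are assumptions for illustration:

import torch.distributed as dist

from colossalai.logging import get_dist_logger  # assumed public accessor for DistributedLogger

# Log only on global rank 0 once the default process group is initialized.
if dist.is_initialized():
    logger = get_dist_logger('example')          # logger name is arbitrary
    logger.info('training started', ranks=[0])   # filtered via self.rank, no ParallelMode argument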
@ -1,6 +1,5 @@
from ._ops import *
from .init import *
from .layer import *
from .loss import *
from .lr_scheduler import *
from .metric import *
from .optimizer import *
@ -1,10 +1,2 @@
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .moe import *
from .utils import *
from .vanilla import *
from .wrapper import *
@ -1,7 +0,0 @@
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)

__all__ = [
    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
]
@ -0,0 +1,14 @@
def divide(numerator, denominator):
    """Only allow exact division.

    Args:
        numerator (int): Numerator of the division.
        denominator (int): Denominator of the division.

    Returns:
        int: the result of exact division.
    """
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator
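A quick usage note for the helper added above; the import path is an assumption, since the diff does not show where the new file lives:

from colossalai.legacy.nn.layer.utils.common import divide  # hypothetical path

assert divide(32, 8) == 4
# divide(32, 5) raises AssertionError: "32 is not divisible by 5"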
@ -1,7 +0,0 @@
from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)

__all__ = [
    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
]
@ -1,41 +1 @@
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.nn.layer.utils import get_tensor_parallel_mode
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss

from .loss_1d import VocabParallelCrossEntropyLoss1D
from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
from .loss_moe import MoeCrossEntropyLoss, MoeLoss

_parallel_cross_entropy = {
    '2d': CrossEntropyLoss2D,
    '2.5d': CrossEntropyLoss2p5D,
    '3d': CrossEntropyLoss3D,
}

_vocab_parallel_cross_entropy = {
    '1d': VocabParallelCrossEntropyLoss1D,
    '2d': VocabParallelCrossEntropyLoss2D,
    '2.5d': VocabParallelCrossEntropyLoss2p5D,
    '3d': VocabParallelCrossEntropyLoss3D,
}


class CrossEntropyLoss(_Loss):

    def __init__(self, reduction: bool = True, *args, **kwargs):
        super().__init__()
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is not None and env.vocab_parallel:
            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
        elif tensor_parallel is None or tensor_parallel == '1d':
            reduction = 'mean' if reduction else 'none'
            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
        else:
            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)

    def forward(self, *args):
        return self.loss(*args)
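For context, the removed wrapper above only dispatches: with vocab parallelism enabled it picks the matching VocabParallelCrossEntropyLoss* variant, with no tensor parallelism (or 1D) it falls back to torch.nn.CrossEntropyLoss, and otherwise it uses the 2D/2.5D/3D class. A hedged sketch of typical usage, assuming a tensor-parallel context has already been initialized; the shapes are illustrative:

import torch

# `CrossEntropyLoss` refers to the wrapper class shown in the removed module above.
criterion = CrossEntropyLoss(reduction=True)     # resolves to a 1d/2d/2.5d/3d loss internally
logits = torch.randn(8, 1000)                    # (batch, num_classes) on the local shard
labels = torch.randint(0, 1000, (8,))
loss = criterion(logits, labels)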
@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler


@LR_SCHEDULERS.register_module
class CosineAnnealingLR(_CosineAnnealingLR):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
        super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class CosineAnnealingWarmupLR(WarmupScheduler):
    """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
|
|||
super().__init__(optimizer, warmup_steps, base_scheduler)
|
||||
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class FlatAnnealingLR(DelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
|
||||
|
||||
|
@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
|
|||
super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
|
||||
|
||||
|
||||
@LR_SCHEDULERS.register_module
|
||||
class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
|
||||
"""Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
|
||||
applied, and then the learning rate will be a fixed value before starting decay.
|
||||
|
|
|
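A brief usage sketch of the warmup scheduler described above; the toy model, optimizer, and the exact constructor arguments (total_steps, warmup_steps) are assumptions for illustration:

import torch

model = torch.nn.Linear(16, 16)                         # toy model, illustrative only
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Assumed signature: linear warmup for `warmup_steps`, then cosine annealing
# over the remaining steps, as the docstring above describes.
scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=100, warmup_steps=10)

for _ in range(100):
    optimizer.step()
    scheduler.step()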
@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import _LRScheduler

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class LinearWarmupLR(_LRScheduler):
    """Linearly warmup learning rate and then linearly decay.
@ -2,12 +2,9 @@ from typing import List

from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import WarmupScheduler


@LR_SCHEDULERS.register_module
class MultiStepLR(_MultiStepLR):
    """Decays the learning rate of each parameter group by gamma once the
    number of epoch reaches one of the milestones. Notice that such decay can
@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR):
        super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class MultiStepWarmupLR(WarmupScheduler):
    """Multistep learning rate scheduler with warmup.
@ -1,9 +1,6 @@
from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class OneCycleLR(_OneCycleLR):
    r"""Sets the learning rate of each parameter group according to the
    1cycle learning rate policy. The 1cycle policy anneals the learning
@ -1,11 +1,8 @@
from torch.optim.lr_scheduler import _LRScheduler

from colossalai.legacy.registry import LR_SCHEDULERS

from .delayed import WarmupScheduler


@LR_SCHEDULERS.register_module
class PolynomialLR(_LRScheduler):
    """Polynomial learning rate scheduler.

@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                for base_lr in self.base_lrs]


@LR_SCHEDULERS.register_module
class PolynomialWarmupLR(WarmupScheduler):
    """Polynomial learning rate scheduler with warmup.
@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR

from colossalai.legacy.registry import LR_SCHEDULERS


@LR_SCHEDULERS.register_module
class LambdaLR(_LambdaLR):
    """Sets the learning rate of each parameter group to the initial lr
    times a given function. When last_epoch=-1, sets initial lr as lr.
@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR):
        super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class MultiplicativeLR(_MultiplicativeLR):
    """Multiply the learning rate of each parameter group by the factor given
    in the specified function. When last_epoch=-1, sets initial lr as lr.
@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
        super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class StepLR(_StepLR):
    """Decays the learning rate of each parameter group by gamma every
    step_size epochs. Notice that such decay can happen simultaneously with
@ -61,7 +56,6 @@ class StepLR(_StepLR):
        super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)


@LR_SCHEDULERS.register_module
class ExponentialLR(_ExponentialLR):
    """Decays the learning rate of each parameter group by gamma every epoch.
    When last_epoch=-1, sets initial lr as lr
Some files were not shown because too many files have changed in this diff.