mirror of https://github.com/hpcaitech/ColossalAI
[legacy] move communication and nn to legacy and refactor logger (#4671)
* [legacy] move communication to legacy (#4640)
* [legacy] refactor logger and clean up legacy codes (#4654)
* [legacy] make logger independent to gpc
* [legacy] make optim independent to registry
* [legacy] move test engine to legacy
* [legacy] move nn to legacy (#4656)
* [legacy] move nn to legacy
* [checkpointio] fix save hf config
* [test] remove useless rpc pp test
* [legacy] fix nn init
* [example] skip tutorial hybrid parallel example
* [devops] test doc check
* [devops] test doc check
parent 536397cc95
commit 554aa9592e
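Most of the hunks below mechanically rewrite import paths after the move of `communication` and `nn` into the `legacy` package. A minimal sketch of the migration for downstream code, using only module paths that appear in the hunks below (everything else is illustrative):

# Old paths removed by this commit:
# from colossalai.communication import all_gather, broadcast
# from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
# import colossalai.nn as col_nn

# New paths introduced by this commit:
from colossalai.legacy.communication import all_gather, broadcast
from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
import colossalai.legacy.nn as col_nn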
@@ -4,7 +4,7 @@ from typing import Optional, Set
 import torch
 import torch.nn as nn

-from colossalai.nn.parallel.data_parallel import _cast_float
+from colossalai.utils import _cast_float
 from colossalai.zero.legacy.gemini.tensor_utils import free_storage

 from .region_manager import RegionManager
@@ -1,5 +1,4 @@
 class Registry:
-    # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here

     def __init__(self, name):
         self.name = name
@@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple
 import torch
 import torch.nn as nn
 from torch.optim import Optimizer
-from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
-from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model

 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.nn.optimizer import ColossalaiOptimizer
@@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T
         checkpoint_path (str): Path to the checkpoint directory.
         is_master (bool): Whether current rank is main process.
     """
+    try:
+        from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype
+        from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model
+    except ImportError:
+        return
     if not isinstance(model, PreTrainedModel):
         return

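The hunk above makes the Hugging Face dependency optional for checkpoint saving: the module-level `transformers` imports removed two hunks earlier are redone lazily inside `save_config_file`, so the function becomes a silent no-op when `transformers` is not installed. A standalone sketch of the same guarded-import pattern (the function name and body here are illustrative; only the `transformers` import path is taken from the diff):

import torch.nn as nn


def export_hf_config(model: nn.Module, checkpoint_path: str) -> None:
    try:
        # Imported lazily so environments without transformers still work.
        from transformers.modeling_utils import PreTrainedModel
    except ImportError:
        return
    if not isinstance(model, PreTrainedModel):
        return
    # A real implementation would unwrap the model and write model.config here.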
@@ -1,6 +1,6 @@
 import torch

-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn


 class MLP(torch.nn.Module):
@@ -1,6 +1,6 @@
 import torch

-from colossalai.nn.layer.colossalai_layer import Embedding, Linear
+from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 from colossalai.utils import get_current_device

 from .bias_dropout_add import bias_dropout_add_fused_train
@@ -1,9 +1,17 @@
-from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
-from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
-                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
-                  recv_forward, recv_backward)
+from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter
+from .p2p import (
+    recv_backward,
+    recv_forward,
+    send_backward,
+    send_backward_recv_backward,
+    send_backward_recv_forward,
+    send_forward,
+    send_forward_backward_recv_forward_backward,
+    send_forward_recv_backward,
+    send_forward_recv_forward,
+)
 from .ring import ring_forward
-from .utils import send_obj_meta, recv_obj_meta
+from .utils import recv_obj_meta, send_obj_meta

 __all__ = [
     'all_gather',
@@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union

 import torch.cuda

-import colossalai.communication as comm
+import colossalai.legacy.communication as comm
 from colossalai.amp.naive_amp import NaiveAMPModel
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
@@ -5,10 +5,10 @@ from typing import Iterable, Tuple

 import torch.cuda

-import colossalai.communication.p2p_v2 as comm
-from colossalai import engine
+import colossalai.legacy.communication.p2p_v2 as comm
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.engine import Engine
 from colossalai.utils.cuda import get_current_device

 from ._pipeline_schedule import PipelineSchedule
@@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule):
     """

     def forward_backward_step(self,
-                              engine: engine.Engine,
+                              engine: Engine,
                               data_iter: Iterable,
                               forward_only=False,
                               return_loss=True,
@@ -0,0 +1,4 @@
+from ._ops import *
+from .layer import *
+from .loss import *
+from .metric import *
@@ -4,7 +4,7 @@ import torch
 import torch.distributed as dist

 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import divide
+from colossalai.legacy.nn.layer.utils import divide
 from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup

 GeneralTensor = Union[ColoTensor, torch.Tensor]
@@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int):
     return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim)


-### table wise embedding shard
+# table wise embedding shard


 def _all_to_all_for_tablewise(x: torch.Tensor,
@@ -1,8 +1,10 @@
-import torch.nn.functional as F
 from typing import Optional

+import torch.nn.functional as F
+
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \
-    ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input

@@ -1,9 +1,11 @@
-import torch.nn.functional as F
 from typing import Optional

+import torch.nn.functional as F
 from torch import Tensor

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,7 +1,10 @@
 from typing import List, Optional

 import torch.nn.functional as F

+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
 from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
+
 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -1,9 +1,12 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
-from typing import Optional
-from colossalai.tensor.op_wrapper import colo_op_impl
+
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
 from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl

 from ._utils import GeneralTensor, convert_to_colo_tensor

@@ -0,0 +1,9 @@
+from .colossalai_layer import *
+from .parallel_1d import *
+from .parallel_2d import *
+from .parallel_2p5d import *
+from .parallel_3d import *
+from .parallel_sequence import *
+from .utils import *
+from .vanilla import *
+from .wrapper import *
@@ -1,7 +1,7 @@
 from ._utils import partition_batch
 from .dropout import Dropout
 from .embedding import Embedding, PatchEmbedding
 from .linear import Classifier, Linear
 from .normalization import LayerNorm

 __all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
@@ -1,151 +1,152 @@
 import math
 from typing import Callable

-from colossalai.utils import get_current_device
 from torch import dtype, nn

-from ... import init as init
+from colossalai.nn import init
+from colossalai.utils import get_current_device
+
 from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
 from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
 from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
 from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
 from ..utils import get_tensor_parallel_mode
 from ..vanilla import VanillaPatchEmbedding
 from ._utils import ColossalaiModule

 _parallel_embedding = {
     '1d': Embedding1D,
     '2d': Embedding2D,
     '2.5d': Embedding2p5D,
     '3d': Embedding3D,
 }

 _vocab_parallel_embedding = {
     '1d': VocabParallelEmbedding1D,
     '2d': VocabParallelEmbedding2D,
     '2.5d': VocabParallelEmbedding2p5D,
     '3d': VocabParallelEmbedding3D
 }

 _parallel_patchembedding = {
     None: VanillaPatchEmbedding,
     '1d': PatchEmbedding1D,
     '2d': PatchEmbedding2D,
     '2.5d': PatchEmbedding2p5D,
     '3d': PatchEmbedding3D
 }


 class Embedding(ColossalaiModule):
     r"""Embedding for colossalai.

     Args:
         num_embeddings (int): number of embeddings.
         embedding_dim (int): dimension of embedding.
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
             therefore, the embedding vector at padding_idx is not updated during training,
             i.e. it remains as a fixed “pad”, defaults to None.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         weight_initializer (:class:`typing.Callable`, optional):
             he initializer of weight, defaults to normal initializer.

     The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
     ::

         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
             renormalized to have norm max_norm. Note: this will modify weight in-place.
         norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
         scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
             of frequency of the words in the mini-batch. Default False.
         sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

     More details about ``args`` and ``kwargs`` could be found in
     `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

     More details about ``initializer`` please refer to
     `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
     """

     def __init__(self,
                  num_embeddings: int,
                  embedding_dim: int,
                  padding_idx: int = None,
                  dtype: dtype = None,
                  weight_initializer: Callable = init.normal_(),
                  vocab_parallel_limit: int = 2048,
                  *args,
                  **kwargs) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel is None:
             embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
                                  **kwargs).to(dtype).to(get_current_device())
             weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
         elif num_embeddings <= vocab_parallel_limit:
             embed = _parallel_embedding[tensor_parallel](
                 num_embeddings,
                 embedding_dim,
                 padding_idx=padding_idx,
                 dtype=dtype,
                 weight_initializer=weight_initializer,
                 *args,
                 **kwargs,
             )
         else:
             embed = _vocab_parallel_embedding[tensor_parallel](
                 num_embeddings,
                 embedding_dim,
                 padding_idx=padding_idx,
                 dtype=dtype,
                 weight_initializer=weight_initializer,
                 *args,
                 **kwargs,
             )
         super().__init__(embed)


 class PatchEmbedding(ColossalaiModule):
     """2D Image to Patch Embedding.

     Args:
         img_size (int): image size.
         patch_size (int): patch size.
         in_chans (int): number of channels of input image.
         embed_size (int): size of embedding.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         flatten (bool, optional): whether to flatten output tensor, defaults to True.
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
             The initializer of bias, defaults to xavier uniform initializer.
         position_embed_initializer (:class:`typing.Callable`, optional):
             The initializer of position embedding, defaults to zeros initializer.

     More details about ``initializer`` please refer to
     `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
     """

     def __init__(
         self,
         img_size: int,
         patch_size: int,
         in_chans: int,
         embed_size: int,
         dtype: dtype = None,
         flatten: bool = True,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         position_embed_initializer: Callable = init.zeros_()
     ) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         embed = _parallel_patchembedding[tensor_parallel](
             img_size,
             patch_size,
             in_chans,
             embed_size,
             dtype=dtype,
             flatten=flatten,
             weight_initializer=weight_initializer,
             bias_initializer=bias_initializer,
             position_embed_initializer=position_embed_initializer,
         )
         super().__init__(embed)
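The relocated `Embedding` and `PatchEmbedding` wrappers above dispatch on the configured tensor-parallel mode: with no mode set they fall back to plain `torch.nn` layers, otherwise they instantiate the matching 1D/2D/2.5D/3D layer, switching to the vocab-parallel embedding once `num_embeddings` exceeds `vocab_parallel_limit`. A minimal usage sketch under the new import path (the surrounding colossalai launch/configuration boilerplate is assumed and omitted):

import colossalai.legacy.nn as col_nn

# Falls back to torch.nn.Embedding when no tensor-parallel mode is configured;
# picks Embedding1D/2D/2.5D/3D or a VocabParallelEmbedding variant otherwise.
embed = col_nn.Embedding(num_embeddings=32000, embedding_dim=768)
patch_embed = col_nn.PatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)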
@@ -4,9 +4,9 @@ from typing import Callable

 from torch import dtype, nn

+from colossalai.nn import init
 from colossalai.utils import get_current_device

-from ... import init as init
 from ..parallel_1d import *
 from ..parallel_2d import *
 from ..parallel_2p5d import *
@@ -1,41 +1,42 @@
-from colossalai.utils import get_current_device
 from torch import nn

+from colossalai.utils import get_current_device
+
 from ..parallel_1d import LayerNorm1D
 from ..parallel_2d import LayerNorm2D
 from ..parallel_2p5d import LayerNorm2p5D
 from ..parallel_3d import LayerNorm3D
 from ..utils import get_tensor_parallel_mode
 from ..vanilla import VanillaLayerNorm
 from ._utils import ColossalaiModule

 _parallel_layernorm = {
     None: VanillaLayerNorm,
     "1d": LayerNorm1D,
     "2d": LayerNorm2D,
     "2.5d": LayerNorm2p5D,
     "3d": LayerNorm3D,
 }


 class LayerNorm(ColossalaiModule):
     r"""Layer Normalization for colossalai.

     Args:
         normalized_shape (int): input shape from an expected input of size.
             :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
                 \times \ldots \times \text{normalized_shape}[-1]]`
             If a single integer is used, it is treated as a singleton list, and this module will
             normalize over the last dimension which is expected to be of that specific size.
         eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
         bias (bool, optional): Whether to add a bias, defaults to ``True``.
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
     """

     def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel is None:
             norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
         else:
             norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
         super().__init__(norm)
@@ -0,0 +1,17 @@
+from .layers import (
+    Classifier1D,
+    Dropout1D,
+    Embedding1D,
+    LayerNorm1D,
+    Linear1D,
+    Linear1D_Col,
+    Linear1D_Row,
+    PatchEmbedding1D,
+    VocabParallelClassifier1D,
+    VocabParallelEmbedding1D,
+)
+
+__all__ = [
+    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
+    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
+]
@@ -3,6 +3,7 @@

 import torch
 import torch.distributed as dist
+
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env

@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
 class _SplitForwardGatherBackward(torch.autograd.Function):
     """
     Split the input and keep only the corresponding chuck to the rank.

     Args:
         input_: input matrix.
         parallel_mode: parallel mode.
@@ -10,11 +10,11 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn.parameter import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)

 __all__ = [
     'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple

 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device


 def matmul_2d(
@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opa = [None] * 2
         opb = [None] * 2
@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opb = [None] * 2
         opr = [None] * 2
@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
         col_group = gpc.get_group(col_parallel_mode)

         src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size
         src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
             pipeline_parallel_rank * tensor_parallel_size

         opa = [None] * 2
         opr = [None] * 2
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
-from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D,
-                     VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D)
+from .layers import (
+    Classifier2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VocabParallelClassifier2p5D,
+    VocabParallelEmbedding2p5D,
+)

 __all__ = [
     'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
@@ -2,13 +2,14 @@ from typing import Any, Tuple

 import torch
 import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter
+from colossalai.utils import get_current_device


 def get_parallel_group(parallel_mode: ParallelMode):
     return gpc.get_group(parallel_mode)
@@ -8,10 +8,10 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import broadcast
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import broadcast
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
 from colossalai.utils.checkpointing import (
@@ -1,6 +1,13 @@
 from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d
-from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D,
-                     VocabParallelEmbedding3D)
+from .layers import (
+    Classifier3D,
+    Embedding3D,
+    LayerNorm3D,
+    Linear3D,
+    PatchEmbedding3D,
+    VocabParallelClassifier3D,
+    VocabParallelEmbedding3D,
+)

 __all__ = [
     'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D',
@@ -7,10 +7,10 @@ import torch
 from torch import Tensor
 from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter
 from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter

 from ._utils import get_parallel_mode_from_env, push_async_grad

@@ -8,14 +8,14 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Parameter

-from colossalai.communication import all_reduce, broadcast
 from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
 from colossalai.context import ParallelMode, seed
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication import all_reduce, broadcast
+from colossalai.legacy.nn.layer.base_layer import ParallelLayer
 from colossalai.legacy.registry import LAYERS
 from colossalai.nn import init as init
-from colossalai.nn.layer.base_layer import ParallelLayer
 from colossalai.utils.checkpointing import (
     broadcast_state_dict,
     gather_tensor_parallel_state_dict,
@@ -1,4 +1,4 @@
-from ._operation import RingQK, RingAV
+from ._operation import RingAV, RingQK
 from .layers import TransformerSelfAttentionRing

 __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK']
@@ -3,13 +3,13 @@

 import torch
 from torch import distributed as dist
+from torch.cuda.amp import custom_bwd, custom_fwd

-from colossalai.communication import ring_forward
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range
+from colossalai.legacy.communication import ring_forward
+from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range
 from colossalai.utils import get_current_device
-from torch.cuda.amp import custom_bwd, custom_fwd


 class RingQK(torch.autograd.Function):
@@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.kernel import FusedScaleMaskSoftmax
 from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType
+from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK
 from colossalai.legacy.registry import LAYERS
-from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK


 @LAYERS.register_module
@@ -0,0 +1,15 @@
+from .common import (
+    ACT2FN,
+    CheckpointModule,
+    _ntuple,
+    divide,
+    get_tensor_parallel_mode,
+    set_tensor_parallel_attribute_by_partition,
+    set_tensor_parallel_attribute_by_size,
+    to_2tuple,
+)
+
+__all__ = [
+    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
+    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
+]
@@ -6,10 +6,11 @@ from itertools import repeat

 import numpy as np
 import torch
+from torch import Tensor, nn

 from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS
 from colossalai.global_variables import tensor_parallel_env as env
 from colossalai.utils import checkpoint
-from torch import Tensor, nn


 class CheckpointModule(nn.Module):
@@ -1,6 +1,8 @@
-import torch.nn as nn
-import torch.distributed as dist
 from typing import List, Tuple, Union

+import torch.distributed as dist
+import torch.nn as nn
+
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
@@ -0,0 +1,41 @@
+from torch import nn
+from torch.nn.modules.loss import *
+from torch.nn.modules.loss import _Loss
+
+from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
+from .loss_1d import VocabParallelCrossEntropyLoss1D
+from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
+from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
+from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
+
+_parallel_cross_entropy = {
+    '2d': CrossEntropyLoss2D,
+    '2.5d': CrossEntropyLoss2p5D,
+    '3d': CrossEntropyLoss3D,
+}
+
+_vocab_parallel_cross_entropy = {
+    '1d': VocabParallelCrossEntropyLoss1D,
+    '2d': VocabParallelCrossEntropyLoss2D,
+    '2.5d': VocabParallelCrossEntropyLoss2p5D,
+    '3d': VocabParallelCrossEntropyLoss3D,
+}
+
+
+class CrossEntropyLoss(_Loss):
+
+    def __init__(self, reduction: bool = True, *args, **kwargs):
+        super().__init__()
+        tensor_parallel = get_tensor_parallel_mode()
+        if tensor_parallel is not None and env.vocab_parallel:
+            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+        elif tensor_parallel is None or tensor_parallel == '1d':
+            reduction = 'mean' if reduction else 'none'
+            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
+        else:
+            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
+
+    def forward(self, *args):
+        return self.loss(*args)
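The new `CrossEntropyLoss` wrapper above picks a vocab-parallel loss when tensor parallelism with `env.vocab_parallel` is active, a 2D/2.5D/3D parallel loss for those modes, and plain `torch.nn.CrossEntropyLoss` otherwise. A minimal usage sketch (this view does not show file paths, so the `colossalai.legacy.nn.loss` module path is an assumption; tensors are placeholders):

import torch

from colossalai.legacy.nn.loss import CrossEntropyLoss  # assumed location of the file added above

criterion = CrossEntropyLoss(reduction=True)
logits = torch.randn(4, 10)            # placeholder predictions
labels = torch.randint(0, 10, (4,))    # placeholder targets
loss = criterion(logits, labels)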
@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
-from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
 from colossalai.utils import get_current_device

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
-from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
 from colossalai.utils import get_current_device

@@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss

 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.legacy.registry import LOSSES
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from colossalai.utils import get_current_device

@@ -1,26 +1,28 @@
 from torch import nn

+from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode
+
 from ._utils import calc_acc
 from .accuracy_2d import Accuracy2D
 from .accuracy_2p5d import Accuracy2p5D
 from .accuracy_3d import Accuracy3D
-from colossalai.nn.layer.utils import get_tensor_parallel_mode

 _parallel_accuracy = {
     '2d': Accuracy2D,
     '2.5d': Accuracy2p5D,
     '3d': Accuracy3D,
 }


 class Accuracy(nn.Module):
+
     def __init__(self):
         super().__init__()
         tensor_parallel = get_tensor_parallel_mode()
         if tensor_parallel not in _parallel_accuracy:
             self.acc = calc_acc
         else:
             self.acc = _parallel_accuracy[tensor_parallel]()

     def forward(self, *args):
         return self.acc(*args)
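The relocated `Accuracy` metric follows the same dispatch scheme, falling back to a plain arg-max comparison (`calc_acc`) when no 2D/2.5D/3D mode is configured. A minimal sketch (module path assumed, since file names are not visible in this view):

import torch

from colossalai.legacy.nn.metric import Accuracy  # assumed location of the module shown above

metric = Accuracy()
logits = torch.randn(4, 10)
targets = torch.randint(0, 10, (4,))
num_correct = metric(logits, targets)  # returns the count of correct predictions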
@@ -1,7 +1,7 @@
 import torch


 def calc_acc(logits, targets):
     preds = torch.argmax(logits, dim=-1)
     correct = torch.sum(targets == preds)
     return correct
@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
 from torch import nn

+from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
+
 from ._utils import calc_acc

@@ -1,7 +1,8 @@
 import torch
-from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
 from torch import nn

+from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
+
 from ._utils import calc_acc

@@ -1,33 +1,35 @@
 import torch
-from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
-from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
 from torch import nn

+from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
+from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+
 from ._utils import calc_acc


 class Accuracy3D(nn.Module):
     """Accuracy for 3D parallelism
     """
+
     def __init__(self):
         super().__init__()
         self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
         self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)

     def forward(self, logits, targets):
         """Calculate the accuracy of predicted labels.

         Args:
             logits (:class:`torch.tensor`): Predicted labels.
             targets (:class:`torch.tensor`): True labels from data.

         Returns:
             float: the accuracy of prediction.
         """
         with torch.no_grad():
             targets = split_tensor_3d(targets, 0, self.weight_parallel_mode)
             targets = split_tensor_3d(targets, 0, self.input_parallel_mode)
             correct = calc_acc(logits, targets)
             correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode)
             return correct
@@ -1,10 +1,17 @@
+from .cache_embedding import (
+    CachedEmbeddingBag,
+    CachedParamMgr,
+    EvictionStrategy,
+    LimitBuffIndexCopyer,
+    ParallelCachedEmbeddingBag,
+    ParallelCachedEmbeddingBagTablewise,
+    ParallelCachedEmbeddingBagTablewiseSpiltCache,
+    TablewiseEmbeddingBagConfig,
+)
 from .colo_module import ColoModule
-from .linear import ColoLinear
 from .embedding import ColoEmbedding
-from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module
-from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \
-    ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache
+from .linear import ColoLinear
+from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module

 __all__ = [
     'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module',
@@ -1,8 +1,8 @@
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from .copyer import LimitBuffIndexCopyer
 from .cached_embedding import CachedEmbeddingBag
-from .parallel_cached_embedding import ParallelCachedEmbeddingBag
+from .copyer import LimitBuffIndexCopyer
 from .embedding_config import TablewiseEmbeddingBagConfig
+from .parallel_cached_embedding import ParallelCachedEmbeddingBag
 from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise
 from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache
@@ -1,4 +1,5 @@
 import abc
+
 import torch.nn as nn
@@ -1,12 +1,14 @@
-import numpy as np
-import torch
-from torch.profiler import record_function
-from typing import List, Optional
-from contexttimer import Timer
-from .copyer import LimitBuffIndexCopyer
-from enum import Enum
 import sys
 from contextlib import contextmanager
+from enum import Enum
+from typing import List, Optional
+
+import numpy as np
+import torch
+from contexttimer import Timer
+from torch.profiler import record_function
+
+from .copyer import LimitBuffIndexCopyer
 
 
 class EvictionStrategy(Enum):
@@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None:
 class CachedParamMgr(torch.nn.Module):
     """
     Manage Embedding Weights on CPU and CUDA memory uses a software cache.
     CPU maintains the entire original weight.
     CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`.
     During training, GPU needs to transmit embedding rows between CPU and GPU.
     Args:
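The docstring above describes the split the software cache relies on: the CPU holds the full embedding weight while CUDA keeps only cuda_row_num rows that are needed soon. A back-of-envelope sketch of why that split pays off; the numbers and the fp32 sizing below are illustrative assumptions, not values taken from the code.

num_embeddings, embedding_dim = 1_000_000, 64
cuda_row_num = 10_000                                    # rows resident in the CUDA cache

cpu_weight_bytes = num_embeddings * embedding_dim * 4    # full fp32 copy kept on CPU
cuda_cache_bytes = cuda_row_num * embedding_dim * 4      # only the cached rows sit on CUDA
print(cpu_weight_bytes // 2**20, cuda_cache_bytes // 2**20)   # ~244 MiB on CPU vs ~2 MiB on CUDA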
@@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module):
         self._elapsed_dict[name] += t.elapsed
 
     def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor:
         """_find_evict_gpu_idxs
         Find the gpu idxs to be evicted, according to their freq.
         Args:
             evict_num (int): how many rows has to be evicted
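_find_evict_gpu_idxs is documented above as picking the rows to evict according to their access frequency. A small sketch of that least-frequently-used selection, assuming a per-row frequency tensor; the name and the use of torch.topk are illustrative assumptions, not the actual method body.

import torch

def find_evict_gpu_idxs_sketch(freq_of_cached_rows: torch.Tensor, evict_num: int) -> torch.Tensor:
    # Return the indices of the `evict_num` cached rows with the lowest frequency.
    _, idxs = torch.topk(freq_of_cached_rows, k=evict_num, largest=False)
    return idxs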
@@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module):
         """reorder
         reorder the weight according to ids' frequency in dataset before training.
         Execute only once before training, also known as warmup phase.
 
         Note:
             If you would like to use the DATASET as the eviction strategy, you must call this function.
         Note:
@@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module):
         """
         deprecated
         evict one row from cuda to cpu.
         Returns:
             (int) : the slot id be evicted.
         """
         mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1)
@@ -1,10 +1,11 @@
+from typing import Iterator, List, Optional, Tuple, Union
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple, Union
+from torch.nn.parameter import Parameter
 
 from .base_embedding import BaseEmbeddingBag
 from .cache_mgr import CachedParamMgr, EvictionStrategy
-from torch.nn.parameter import Parameter
 
 
 class CachedEmbeddingBag(BaseEmbeddingBag):
@@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
         ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
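One parameter listed above, include_last_offset, follows the same CSR-style convention as the stock PyTorch nn.EmbeddingBag, which can be demonstrated with plain PyTorch; the sizes below are made up for illustration only.

import torch
import torch.nn as nn

# With include_last_offset=True, offsets carries one extra trailing element equal
# to len(indices), so N bags need N+1 offsets.
bag = nn.EmbeddingBag(10, 4, mode='mean', include_last_offset=True)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4, 8])      # 2 bags; last element == len(indices)
out = bag(indices, offsets)
print(out.shape)                        # torch.Size([2, 4])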
@@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
                     buffer_size=50_000,
                     pin_weight=False):
         """
         Called after initialized.
         Reorder the weight rows according to the ids_freq_mapping.
         Then, let the weights of the Module be managed by a CachedParamMgr.
 
         Args:
             cuda_row_num (int): number of rows can be hosted in CUDA memory
             ids_freq_mapping (List[int]): a list, idx is id number, value is freq
@@ -3,7 +3,7 @@ from torch import LongTensor
 
 
 class LimitBuffIndexCopyer(object):
     """LimitBuffIndexCopyer
     Index Copy using limited temp buffer on CUDA.
 
     Args:
@@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object):
 
     @torch.no_grad()
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
         The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.
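The index_copy docstring above describes a buffered gather/scatter: rows are read contiguously from the source, staged in a bounded temporary buffer, and scattered into the target. A sketch of that pattern with plain torch calls follows; the function name, buffer size, and device handling are assumptions, not the class implementation.

import torch

def buffered_index_copy_sketch(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # Move at most buff_size rows per step: src[src_index] -> tmp -> tgt[tgt_index].
    for start in range(0, src_index.numel(), buff_size):
        s_idx = src_index[start:start + buff_size].to(src.device)
        t_idx = tgt_index[start:start + buff_size].to(tgt.device)
        tmp = src.index_select(dim, s_idx).to(tgt.device)   # contiguous gather
        tgt.index_copy_(dim, t_idx, tmp)                     # scattered write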
@@ -1,12 +1,13 @@
+from typing import Iterator, List, Optional, Tuple
+
 import torch
 import torch.nn.functional as F
-from typing import List, Optional, Iterator, Tuple
 
-from .cached_embedding import CachedEmbeddingBag
-from colossalai.nn._ops._utils import dual_all_to_all
-from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor
+from colossalai.legacy.nn._ops._utils import dual_all_to_all
+from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec
+
 from .cache_mgr import CachedParamMgr, EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
 
 
 def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]:
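get_partition, whose signature appears as context above, splits the embedding dimension across tensor-parallel ranks. Below is a generic even-split sketch with the same shape of signature; the meaning of the returned bool in the real helper is an assumption here (taken as "the dimension divides evenly").

from typing import Tuple

def get_partition_sketch(embedding_dim: int, rank: int, world_size: int) -> Tuple[int, int, bool]:
    # Give each rank a contiguous [start, end) slice; earlier ranks absorb the remainder.
    base = embedding_dim // world_size
    remainder = embedding_dim % world_size
    start = rank * base + min(rank, remainder)
    end = start + base + (1 if rank < remainder else 0)
    return start, end, remainder == 0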
@@ -1,15 +1,16 @@
+import time
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
-from .cached_embedding import CachedEmbeddingBag
-from .cache_mgr import EvictionStrategy
-from .embedding_config import TablewiseEmbeddingBagConfig
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
 
-from typing import List
-import time
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag):
@@ -1,17 +1,17 @@
+import abc
+from typing import List
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.profiler import record_function
 
-from .cached_embedding import CachedEmbeddingBag
+from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise
 from colossalai.tensor import ProcessGroup
-from colossalai.nn._ops._utils import dual_all_to_all_tablewise
-from .embedding_config import TablewiseEmbeddingBagConfig
-from .cache_mgr import EvictionStrategy
 
-from typing import List
-import abc
+from .cache_mgr import EvictionStrategy
+from .cached_embedding import CachedEmbeddingBag
+from .embedding_config import TablewiseEmbeddingBagConfig
 
 
 class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
@@ -1,6 +1,7 @@
-from colossalai.tensor.distspec import _DistSpec
+from typing import Dict, List
+
 from colossalai.tensor import ComputePattern
-from typing import List, Dict
+from colossalai.tensor.distspec import _DistSpec
 
 
 class ColoModule(object):
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoEmbedding(ColoModule):
@@ -1,5 +1,6 @@
+from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec
+
 from .colo_module import ColoModule
-from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec
 
 
 class ColoLinear(ColoModule):
@@ -1,9 +1,11 @@
 from typing import Dict
-from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup
-from colossalai.tensor import distspec
-from . import ColoModule
+
 import torch
 
+from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec
+
+from . import ColoModule
+
 _COLOSSAL_MODULES: Dict[type, ColoModule] = {}
@@ -7,9 +7,9 @@ from typing import Callable
 import torch
 import torch.distributed as dist
 
-from colossalai.communication import all_reduce
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.communication import all_reduce
 from colossalai.legacy.registry import HOOKS
 from colossalai.utils import get_current_device, is_no_pp_or_last_stage
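The hook above now pulls all_reduce from colossalai.legacy.communication. For readers who only need the idea, the kind of cross-rank reduction such metric hooks perform can be sketched with plain torch.distributed; the scalar value and device choice below are illustrative, and this is not the hook's actual code.

import torch
import torch.distributed as dist

def average_scalar_across_ranks(value: float) -> float:
    # Sum the scalar over all ranks, then divide by the world size.
    t = torch.tensor([value], device='cuda' if torch.cuda.is_available() else 'cpu')
    if dist.is_initialized():
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        t /= dist.get_world_size()
    return t.item()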
@@ -6,8 +6,7 @@ import logging
 from pathlib import Path
 from typing import List, Union
 
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+import torch.distributed as dist
 
 
 class DistributedLogger:
@@ -63,6 +62,7 @@ class DistributedLogger:
         self._logger.propagate = False
 
         DistributedLogger.__instances[name] = self
+        self.rank = dist.get_rank() if dist.is_initialized() else 0
 
     @staticmethod
     def __get_call_info():
@@ -109,16 +109,10 @@ class DistributedLogger:
         # create log directory
         path.mkdir(parents=True, exist_ok=True)
 
-        # set the default file name if path is a directory
-        if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL):
-            rank = 0
-        else:
-            rank = colossalai.core.global_context.get_global_rank()
-
         if suffix is not None:
-            log_file_name = f'rank_{rank}_{suffix}.log'
+            log_file_name = f'rank_{self.rank}_{suffix}.log'
         else:
-            log_file_name = f'rank_{rank}.log'
+            log_file_name = f'rank_{self.rank}.log'
         path = path.joinpath(log_file_name)
 
         # add file handler
@@ -128,19 +122,14 @@ class DistributedLogger:
         file_handler.setFormatter(formatter)
         self._logger.addHandler(file_handler)
 
-    def _log(self,
-             level,
-             message: str,
-             parallel_mode: ParallelMode = ParallelMode.GLOBAL,
-             ranks: List[int] = None) -> None:
+    def _log(self, level, message: str, ranks: List[int] = None) -> None:
         if ranks is None:
             getattr(self._logger, level)(message)
         else:
-            local_rank = colossalai.core.global_context.get_local_rank(parallel_mode)
-            if local_rank in ranks:
+            if self.rank in ranks:
                 getattr(self._logger, level)(message)
 
-    def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def info(self, message: str, ranks: List[int] = None) -> None:
         """Log an info message.
 
         Args:
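After this refactor the logger no longer consults the global parallel context: it caches the process rank once from torch.distributed and filters on it. A minimal stand-alone sketch of that pattern follows; SimpleRankLogger is a hypothetical stand-in, not the DistributedLogger API.

import torch.distributed as dist

class SimpleRankLogger:
    def __init__(self):
        # Fall back to rank 0 when the process group has not been initialized.
        self.rank = dist.get_rank() if dist.is_initialized() else 0

    def info(self, message, ranks=None):
        # Emit only when no rank filter is given or this rank is listed.
        if ranks is None or self.rank in ranks:
            print(f"[rank {self.rank}] {message}")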
@@ -150,10 +139,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('info', message_prefix, parallel_mode, ranks)
-        self._log('info', message, parallel_mode, ranks)
+        self._log('info', message_prefix, ranks)
+        self._log('info', message, ranks)
 
-    def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def warning(self, message: str, ranks: List[int] = None) -> None:
         """Log a warning message.
 
         Args:
@@ -163,10 +152,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('warning', message_prefix, parallel_mode, ranks)
-        self._log('warning', message, parallel_mode, ranks)
+        self._log('warning', message_prefix, ranks)
+        self._log('warning', message, ranks)
 
-    def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def debug(self, message: str, ranks: List[int] = None) -> None:
         """Log a debug message.
 
         Args:
@@ -176,10 +165,10 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('debug', message_prefix, parallel_mode, ranks)
-        self._log('debug', message, parallel_mode, ranks)
+        self._log('debug', message_prefix, ranks)
+        self._log('debug', message, ranks)
 
-    def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None:
+    def error(self, message: str, ranks: List[int] = None) -> None:
         """Log an error message.
 
         Args:
@@ -189,5 +178,5 @@ class DistributedLogger:
             ranks (List[int]): List of parallel ranks.
         """
         message_prefix = "{}:{} {}".format(*self.__get_call_info())
-        self._log('error', message_prefix, parallel_mode, ranks)
-        self._log('error', message, parallel_mode, ranks)
+        self._log('error', message_prefix, ranks)
+        self._log('error', message, ranks)
@@ -1,6 +1,5 @@
-from ._ops import *
+from .init import *
 from .layer import *
 from .loss import *
 from .lr_scheduler import *
-from .metric import *
 from .optimizer import *
@@ -1,10 +1,2 @@
-from .colossalai_layer import *
-from .parallel_1d import *
-from .parallel_2d import *
-from .parallel_2p5d import *
-from .parallel_3d import *
-from .parallel_sequence import *
 from .moe import *
 from .utils import *
-from .vanilla import *
-from .wrapper import *
@@ -1,7 +0,0 @@
-from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
-                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
-
-__all__ = [
-    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
-    'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D'
-]
@@ -0,0 +1,14 @@
+def divide(numerator, denominator):
+    """Only allow exact division.
+
+    Args:
+        numerator (int): Numerator of the division.
+        denominator (int): Denominator of the division.
+
+    Returns:
+        int: the result of exact division.
+    """
+    assert denominator != 0, 'denominator can not be zero'
+    assert numerator % denominator == 0, \
+        '{} is not divisible by {}'.format(numerator, denominator)
+    return numerator // denominator
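The new divide helper above only permits exact division. A quick usage check, restating the same logic so the snippet runs on its own:

def divide(numerator, denominator):
    # Same behaviour as the helper added in the hunk above.
    assert denominator != 0, 'denominator can not be zero'
    assert numerator % denominator == 0, \
        '{} is not divisible by {}'.format(numerator, denominator)
    return numerator // denominator

assert divide(12, 4) == 3     # exact division passes
# divide(7, 2) would raise AssertionError: "7 is not divisible by 2"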
@@ -1,7 +0,0 @@
-from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode,
-                     set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple)
-
-__all__ = [
-    'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size',
-    'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple'
-]
@@ -1,41 +1 @@
-from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn.layer.utils import get_tensor_parallel_mode
-from torch import nn
-from torch.nn.modules.loss import *
-from torch.nn.modules.loss import _Loss
-
-from .loss_1d import VocabParallelCrossEntropyLoss1D
-from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D
-from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D
-from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D
 from .loss_moe import MoeCrossEntropyLoss, MoeLoss
-
-_parallel_cross_entropy = {
-    '2d': CrossEntropyLoss2D,
-    '2.5d': CrossEntropyLoss2p5D,
-    '3d': CrossEntropyLoss3D,
-}
-
-_vocab_parallel_cross_entropy = {
-    '1d': VocabParallelCrossEntropyLoss1D,
-    '2d': VocabParallelCrossEntropyLoss2D,
-    '2.5d': VocabParallelCrossEntropyLoss2p5D,
-    '3d': VocabParallelCrossEntropyLoss3D,
-}
-
-
-class CrossEntropyLoss(_Loss):
-
-    def __init__(self, reduction: bool = True, *args, **kwargs):
-        super().__init__()
-        tensor_parallel = get_tensor_parallel_mode()
-        if tensor_parallel is not None and env.vocab_parallel:
-            self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-        elif tensor_parallel is None or tensor_parallel == '1d':
-            reduction = 'mean' if reduction else 'none'
-            self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
-        else:
-            self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
-
-    def forward(self, *args):
-        return self.loss(*args)
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingLR(_CosineAnnealingLR):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
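With the LR_SCHEDULERS.register_module decorator removed above (and in the hunks that follow), these schedulers are plain classes that are instantiated directly rather than looked up from a registry. A minimal sketch of direct use with vanilla PyTorch objects; the model, learning rate, and step counts here are made up for illustration.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=0.0)

for _ in range(10):
    optimizer.step()      # normally preceded by forward/backward
    scheduler.step()      # anneal the learning rate once per step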
@@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR):
         super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class CosineAnnealingWarmupLR(WarmupScheduler):
     """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied.
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
         super().__init__(optimizer, warmup_steps, base_scheduler)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingLR(DelayerScheduler):
     """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay.
@@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler):
         super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class FlatAnnealingWarmupLR(WarmupDelayerScheduler):
     """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be
     applied, and then the learning rate will be a fixed value before starting decay.
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LinearWarmupLR(_LRScheduler):
     """Linearly warmup learning rate and then linearly decay.
@@ -2,12 +2,9 @@ from typing import List
 
 from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class MultiStepLR(_MultiStepLR):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
|
||||||
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
|
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
|
||||||
|
|
||||||
|
|
||||||
@LR_SCHEDULERS.register_module
|
|
||||||
class MultiStepWarmupLR(WarmupScheduler):
|
class MultiStepWarmupLR(WarmupScheduler):
|
||||||
"""Multistep learning rate scheduler with warmup.
|
"""Multistep learning rate scheduler with warmup.
|
||||||
|
|
||||||
|
|
|
@@ -1,9 +1,6 @@
 from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class OneCycleLR(_OneCycleLR):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
@@ -1,11 +1,8 @@
 from torch.optim.lr_scheduler import _LRScheduler
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 from .delayed import WarmupScheduler
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialLR(_LRScheduler):
     """Polynomial learning rate scheduler.
@@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler):
                 for base_lr in self.base_lrs]
 
 
-@LR_SCHEDULERS.register_module
 class PolynomialWarmupLR(WarmupScheduler):
     """Polynomial learning rate scheduler with warmup.
@@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
 from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
 from torch.optim.lr_scheduler import StepLR as _StepLR
 
-from colossalai.legacy.registry import LR_SCHEDULERS
-
 
-@LR_SCHEDULERS.register_module
 class LambdaLR(_LambdaLR):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
|
||||||
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
|
||||||
|
|
||||||
|
|
||||||
@LR_SCHEDULERS.register_module
|
|
||||||
class MultiplicativeLR(_MultiplicativeLR):
|
class MultiplicativeLR(_MultiplicativeLR):
|
||||||
"""Multiply the learning rate of each parameter group by the factor given
|
"""Multiply the learning rate of each parameter group by the factor given
|
||||||
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
in the specified function. When last_epoch=-1, sets initial lr as lr.
|
||||||
|
@@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR):
         super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class StepLR(_StepLR):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
@@ -61,7 +56,6 @@ class StepLR(_StepLR):
         super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch)
 
 
-@LR_SCHEDULERS.register_module
 class ExponentialLR(_ExponentialLR):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr
Some files were not shown because too many files have changed in this diff.