diff --git a/colossalai/auto_parallel/offload/base_offload_module.py b/colossalai/auto_parallel/offload/base_offload_module.py index d0c328e13..5b9f74b13 100644 --- a/colossalai/auto_parallel/offload/base_offload_module.py +++ b/colossalai/auto_parallel/offload/base_offload_module.py @@ -4,7 +4,7 @@ from typing import Optional, Set import torch import torch.nn as nn -from colossalai.nn.parallel.data_parallel import _cast_float +from colossalai.utils import _cast_float from colossalai.zero.legacy.gemini.tensor_utils import free_storage from .region_manager import RegionManager diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py index 1a90c72bd..730a90d74 100644 --- a/colossalai/auto_parallel/tensor_shard/node_handler/registry.py +++ b/colossalai/auto_parallel/tensor_shard/node_handler/registry.py @@ -1,5 +1,4 @@ class Registry: - # TODO: refactor the registry classes used in colossalai.legacy.registry, colossalai.fx and here def __init__(self, name): self.name = name diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 6dadaba3e..3441eca38 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -11,8 +11,6 @@ from typing import Iterator, List, Mapping, Optional, OrderedDict, Tuple import torch import torch.nn as nn from torch.optim import Optimizer -from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype -from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model from colossalai.interface import ModelWrapper, OptimizerWrapper from colossalai.nn.optimizer import ColossalaiOptimizer @@ -383,6 +381,11 @@ def save_config_file(model: nn.Module, checkpoint_path: str, is_master: bool = T checkpoint_path (str): Path to the checkpoint directory. is_master (bool): Whether current rank is main process. 
""" + try: + from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype + from transformers.modeling_utils import unwrap_model as unwrap_huggingface_model + except ImportError: + return if not isinstance(model, PreTrainedModel): return diff --git a/colossalai/cli/benchmark/models.py b/colossalai/cli/benchmark/models.py index f8fd1c41a..385b485b6 100644 --- a/colossalai/cli/benchmark/models.py +++ b/colossalai/cli/benchmark/models.py @@ -1,6 +1,6 @@ import torch -import colossalai.nn as col_nn +import colossalai.legacy.nn as col_nn class MLP(torch.nn.Module): diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py index e20c08b05..8eb4e0c88 100644 --- a/colossalai/kernel/jit/option.py +++ b/colossalai/kernel/jit/option.py @@ -1,6 +1,6 @@ import torch -from colossalai.nn.layer.colossalai_layer import Embedding, Linear +from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear from colossalai.utils import get_current_device from .bias_dropout_add import bias_dropout_add_fused_train diff --git a/colossalai/communication/__init__.py b/colossalai/legacy/communication/__init__.py similarity index 53% rename from colossalai/communication/__init__.py rename to colossalai/legacy/communication/__init__.py index 220481b7a..88ad0487b 100644 --- a/colossalai/communication/__init__.py +++ b/colossalai/legacy/communication/__init__.py @@ -1,9 +1,17 @@ -from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce -from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward, - send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward, - recv_forward, recv_backward) +from .collective import all_gather, all_reduce, broadcast, reduce, reduce_scatter +from .p2p import ( + recv_backward, + recv_forward, + send_backward, + send_backward_recv_backward, + send_backward_recv_forward, + send_forward, + send_forward_backward_recv_forward_backward, + send_forward_recv_backward, + send_forward_recv_forward, +) from .ring import ring_forward -from .utils import send_obj_meta, recv_obj_meta +from .utils import recv_obj_meta, send_obj_meta __all__ = [ 'all_gather', diff --git a/colossalai/communication/collective.py b/colossalai/legacy/communication/collective.py similarity index 100% rename from colossalai/communication/collective.py rename to colossalai/legacy/communication/collective.py diff --git a/colossalai/communication/p2p.py b/colossalai/legacy/communication/p2p.py similarity index 100% rename from colossalai/communication/p2p.py rename to colossalai/legacy/communication/p2p.py diff --git a/colossalai/communication/p2p_v2.py b/colossalai/legacy/communication/p2p_v2.py similarity index 100% rename from colossalai/communication/p2p_v2.py rename to colossalai/legacy/communication/p2p_v2.py diff --git a/colossalai/communication/ring.py b/colossalai/legacy/communication/ring.py similarity index 100% rename from colossalai/communication/ring.py rename to colossalai/legacy/communication/ring.py diff --git a/colossalai/communication/utils.py b/colossalai/legacy/communication/utils.py similarity index 100% rename from colossalai/communication/utils.py rename to colossalai/legacy/communication/utils.py diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule.py b/colossalai/legacy/engine/schedule/_pipeline_schedule.py index 88b54ce6a..4571fd679 100644 --- a/colossalai/legacy/engine/schedule/_pipeline_schedule.py +++ 
b/colossalai/legacy/engine/schedule/_pipeline_schedule.py @@ -6,7 +6,7 @@ from typing import Callable, List, Tuple, Union import torch.cuda -import colossalai.communication as comm +import colossalai.legacy.communication as comm from colossalai.amp.naive_amp import NaiveAMPModel from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc diff --git a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py index 9e7372b67..385c61537 100644 --- a/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py +++ b/colossalai/legacy/engine/schedule/_pipeline_schedule_v2.py @@ -5,10 +5,10 @@ from typing import Iterable, Tuple import torch.cuda -import colossalai.communication.p2p_v2 as comm -from colossalai import engine +import colossalai.legacy.communication.p2p_v2 as comm from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.engine import Engine from colossalai.utils.cuda import get_current_device from ._pipeline_schedule import PipelineSchedule @@ -60,7 +60,7 @@ class PipelineScheduleV2(PipelineSchedule): """ def forward_backward_step(self, - engine: engine.Engine, + engine: Engine, data_iter: Iterable, forward_only=False, return_loss=True, diff --git a/colossalai/legacy/nn/__init__.py b/colossalai/legacy/nn/__init__.py new file mode 100644 index 000000000..500162901 --- /dev/null +++ b/colossalai/legacy/nn/__init__.py @@ -0,0 +1,4 @@ +from ._ops import * +from .layer import * +from .loss import * +from .metric import * diff --git a/colossalai/nn/_ops/__init__.py b/colossalai/legacy/nn/_ops/__init__.py similarity index 100% rename from colossalai/nn/_ops/__init__.py rename to colossalai/legacy/nn/_ops/__init__.py diff --git a/colossalai/nn/_ops/_utils.py b/colossalai/legacy/nn/_ops/_utils.py similarity index 99% rename from colossalai/nn/_ops/_utils.py rename to colossalai/legacy/nn/_ops/_utils.py index 24877bbb5..131c21547 100644 --- a/colossalai/nn/_ops/_utils.py +++ b/colossalai/legacy/nn/_ops/_utils.py @@ -4,7 +4,7 @@ import torch import torch.distributed as dist from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import divide +from colossalai.legacy.nn.layer.utils import divide from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup GeneralTensor = Union[ColoTensor, torch.Tensor] @@ -232,7 +232,7 @@ def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int): return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim) -### table wise embedding shard +# table wise embedding shard def _all_to_all_for_tablewise(x: torch.Tensor, diff --git a/colossalai/nn/_ops/addmm.py b/colossalai/legacy/nn/_ops/addmm.py similarity index 100% rename from colossalai/nn/_ops/addmm.py rename to colossalai/legacy/nn/_ops/addmm.py diff --git a/colossalai/nn/_ops/batch_norm.py b/colossalai/legacy/nn/_ops/batch_norm.py similarity index 100% rename from colossalai/nn/_ops/batch_norm.py rename to colossalai/legacy/nn/_ops/batch_norm.py diff --git a/colossalai/nn/_ops/element_wise.py b/colossalai/legacy/nn/_ops/element_wise.py similarity index 100% rename from colossalai/nn/_ops/element_wise.py rename to colossalai/legacy/nn/_ops/element_wise.py diff --git a/colossalai/nn/_ops/embedding.py b/colossalai/legacy/nn/_ops/embedding.py similarity index 98% rename from colossalai/nn/_ops/embedding.py rename to colossalai/legacy/nn/_ops/embedding.py index a045f305b..b145d1763 
100644 --- a/colossalai/nn/_ops/embedding.py +++ b/colossalai/legacy/nn/_ops/embedding.py @@ -1,8 +1,10 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ColoTensorSpec, ComputePattern, ComputeSpec, ColoTensor, ShardSpec, \ - ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor, reduce_input diff --git a/colossalai/nn/_ops/embedding_bag.py b/colossalai/legacy/nn/_ops/embedding_bag.py similarity index 97% rename from colossalai/nn/_ops/embedding_bag.py rename to colossalai/legacy/nn/_ops/embedding_bag.py index 0026f579b..9a656d587 100644 --- a/colossalai/nn/_ops/embedding_bag.py +++ b/colossalai/legacy/nn/_ops/embedding_bag.py @@ -1,9 +1,11 @@ -import torch.nn.functional as F from typing import Optional + +import torch.nn.functional as F from torch import Tensor + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \ - ShardSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/layernorm.py b/colossalai/legacy/nn/_ops/layernorm.py similarity index 92% rename from colossalai/nn/_ops/layernorm.py rename to colossalai/legacy/nn/_ops/layernorm.py index 2b761b84e..9960c5d48 100644 --- a/colossalai/nn/_ops/layernorm.py +++ b/colossalai/legacy/nn/_ops/layernorm.py @@ -1,7 +1,10 @@ from typing import List, Optional + import torch.nn.functional as F + +from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec from colossalai.tensor.op_wrapper import colo_op_impl -from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/linear.py b/colossalai/legacy/nn/_ops/linear.py similarity index 100% rename from colossalai/nn/_ops/linear.py rename to colossalai/legacy/nn/_ops/linear.py diff --git a/colossalai/nn/_ops/loss.py b/colossalai/legacy/nn/_ops/loss.py similarity index 96% rename from colossalai/nn/_ops/loss.py rename to colossalai/legacy/nn/_ops/loss.py index 1e54f6628..90efbfa36 100644 --- a/colossalai/nn/_ops/loss.py +++ b/colossalai/legacy/nn/_ops/loss.py @@ -1,9 +1,12 @@ +from typing import Optional + import torch import torch.nn.functional as F -from typing import Optional -from colossalai.tensor.op_wrapper import colo_op_impl + +from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D from colossalai.tensor import ColoTensor, ColoTensorSpec -from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D +from colossalai.tensor.op_wrapper import colo_op_impl + from ._utils import GeneralTensor, convert_to_colo_tensor diff --git a/colossalai/nn/_ops/view.py b/colossalai/legacy/nn/_ops/view.py similarity index 100% rename from colossalai/nn/_ops/view.py rename to colossalai/legacy/nn/_ops/view.py diff --git a/colossalai/legacy/nn/layer/__init__.py b/colossalai/legacy/nn/layer/__init__.py new file mode 100644 index 000000000..86961dd93 --- /dev/null +++ b/colossalai/legacy/nn/layer/__init__.py @@ -0,0 +1,9 @@ +from .colossalai_layer import * +from .parallel_1d import * +from 
.parallel_2d import * +from .parallel_2p5d import * +from .parallel_3d import * +from .parallel_sequence import * +from .utils import * +from .vanilla import * +from .wrapper import * diff --git a/colossalai/nn/layer/base_layer.py b/colossalai/legacy/nn/layer/base_layer.py similarity index 100% rename from colossalai/nn/layer/base_layer.py rename to colossalai/legacy/nn/layer/base_layer.py diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/__init__.py rename to colossalai/legacy/nn/layer/colossalai_layer/__init__.py index 2ae1b07a7..ed743820d 100644 --- a/colossalai/nn/layer/colossalai_layer/__init__.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/__init__.py @@ -1,7 +1,7 @@ -from ._utils import partition_batch -from .dropout import Dropout -from .embedding import Embedding, PatchEmbedding -from .linear import Classifier, Linear -from .normalization import LayerNorm - -__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] +from ._utils import partition_batch +from .dropout import Dropout +from .embedding import Embedding, PatchEmbedding +from .linear import Classifier, Linear +from .normalization import LayerNorm + +__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch'] diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/legacy/nn/layer/colossalai_layer/_utils.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/_utils.py rename to colossalai/legacy/nn/layer/colossalai_layer/_utils.py diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/legacy/nn/layer/colossalai_layer/dropout.py similarity index 100% rename from colossalai/nn/layer/colossalai_layer/dropout.py rename to colossalai/legacy/nn/layer/colossalai_layer/dropout.py diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/embedding.py rename to colossalai/legacy/nn/layer/colossalai_layer/embedding.py index e5c9c46e0..28bcb7ffe 100644 --- a/colossalai/nn/layer/colossalai_layer/embedding.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/embedding.py @@ -1,151 +1,152 @@ -import math -from typing import Callable - -from colossalai.utils import get_current_device -from torch import dtype, nn - -from ... 
import init as init -from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D -from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D -from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D -from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaPatchEmbedding -from ._utils import ColossalaiModule - -_parallel_embedding = { - '1d': Embedding1D, - '2d': Embedding2D, - '2.5d': Embedding2p5D, - '3d': Embedding3D, -} - -_vocab_parallel_embedding = { - '1d': VocabParallelEmbedding1D, - '2d': VocabParallelEmbedding2D, - '2.5d': VocabParallelEmbedding2p5D, - '3d': VocabParallelEmbedding3D -} - -_parallel_patchembedding = { - None: VanillaPatchEmbedding, - '1d': PatchEmbedding1D, - '2d': PatchEmbedding2D, - '2.5d': PatchEmbedding2p5D, - '3d': PatchEmbedding3D -} - - -class Embedding(ColossalaiModule): - r"""Embedding for colossalai. - - Args: - num_embeddings (int): number of embeddings. - embedding_dim (int): dimension of embedding. - padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; - therefore, the embedding vector at padding_idx is not updated during training, - i.e. it remains as a fixed “pad”, defaults to None. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - weight_initializer (:class:`typing.Callable`, optional): - he initializer of weight, defaults to normal initializer. - - The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: - :: - - max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is - renormalized to have norm max_norm. Note: this will modify weight in-place. - norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. - scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse - of frequency of the words in the mini-batch. Default False. - sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. - - More details about ``args`` and ``kwargs`` could be found in - `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_. 
- - More details about ``initializer`` please refer to - `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_ - """ - - def __init__(self, - num_embeddings: int, - embedding_dim: int, - padding_idx: int = None, - dtype: dtype = None, - weight_initializer: Callable = init.normal_(), - vocab_parallel_limit: int = 2048, - *args, - **kwargs) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, - **kwargs).to(dtype).to(get_current_device()) - weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) - elif num_embeddings <= vocab_parallel_limit: - embed = _parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - else: - embed = _vocab_parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - super().__init__(embed) - - -class PatchEmbedding(ColossalaiModule): - """2D Image to Patch Embedding. - - Args: - img_size (int): image size. - patch_size (int): patch size. - in_chans (int): number of channels of input image. - embed_size (int): size of embedding. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - flatten (bool, optional): whether to flatten output tensor, defaults to True. - weight_initializer (:class:`typing.Callable`, optional): - The initializer of weight, defaults to kaiming uniform initializer. - bias_initializer (:class:`typing.Callable`, optional): - The initializer of bias, defaults to xavier uniform initializer. - position_embed_initializer (:class:`typing.Callable`, optional): - The initializer of position embedding, defaults to zeros initializer. - - More details about ``initializer`` please refer to - `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_. 
- """ - - def __init__( - self, - img_size: int, - patch_size: int, - in_chans: int, - embed_size: int, - dtype: dtype = None, - flatten: bool = True, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - position_embed_initializer: Callable = init.zeros_() - ) -> None: - tensor_parallel = get_tensor_parallel_mode() - embed = _parallel_patchembedding[tensor_parallel]( - img_size, - patch_size, - in_chans, - embed_size, - dtype=dtype, - flatten=flatten, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - position_embed_initializer=position_embed_initializer, - ) - super().__init__(embed) +import math +from typing import Callable + +from torch import dtype, nn + +from colossalai.nn import init +from colossalai.utils import get_current_device + +from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D +from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D +from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D +from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaPatchEmbedding +from ._utils import ColossalaiModule + +_parallel_embedding = { + '1d': Embedding1D, + '2d': Embedding2D, + '2.5d': Embedding2p5D, + '3d': Embedding3D, +} + +_vocab_parallel_embedding = { + '1d': VocabParallelEmbedding1D, + '2d': VocabParallelEmbedding2D, + '2.5d': VocabParallelEmbedding2p5D, + '3d': VocabParallelEmbedding3D +} + +_parallel_patchembedding = { + None: VanillaPatchEmbedding, + '1d': PatchEmbedding1D, + '2d': PatchEmbedding2D, + '2.5d': PatchEmbedding2p5D, + '3d': PatchEmbedding3D +} + + +class Embedding(ColossalaiModule): + r"""Embedding for colossalai. + + Args: + num_embeddings (int): number of embeddings. + embedding_dim (int): dimension of embedding. + padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; + therefore, the embedding vector at padding_idx is not updated during training, + i.e. it remains as a fixed “pad”, defaults to None. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + weight_initializer (:class:`typing.Callable`, optional): + he initializer of weight, defaults to normal initializer. + + The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain: + :: + + max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is + renormalized to have norm max_norm. Note: this will modify weight in-place. + norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2. + scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse + of frequency of the words in the mini-batch. Default False. + sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False. + + More details about ``args`` and ``kwargs`` could be found in + `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_. 
+ + More details about ``initializer`` please refer to + `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_ + """ + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + dtype: dtype = None, + weight_initializer: Callable = init.normal_(), + vocab_parallel_limit: int = 2048, + *args, + **kwargs) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args, + **kwargs).to(dtype).to(get_current_device()) + weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) + elif num_embeddings <= vocab_parallel_limit: + embed = _parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + else: + embed = _vocab_parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + super().__init__(embed) + + +class PatchEmbedding(ColossalaiModule): + """2D Image to Patch Embedding. + + Args: + img_size (int): image size. + patch_size (int): patch size. + in_chans (int): number of channels of input image. + embed_size (int): size of embedding. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. + flatten (bool, optional): whether to flatten output tensor, defaults to True. + weight_initializer (:class:`typing.Callable`, optional): + The initializer of weight, defaults to kaiming uniform initializer. + bias_initializer (:class:`typing.Callable`, optional): + The initializer of bias, defaults to xavier uniform initializer. + position_embed_initializer (:class:`typing.Callable`, optional): + The initializer of position embedding, defaults to zeros initializer. + + More details about ``initializer`` please refer to + `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_. + """ + + def __init__( + self, + img_size: int, + patch_size: int, + in_chans: int, + embed_size: int, + dtype: dtype = None, + flatten: bool = True, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), + position_embed_initializer: Callable = init.zeros_() + ) -> None: + tensor_parallel = get_tensor_parallel_mode() + embed = _parallel_patchembedding[tensor_parallel]( + img_size, + patch_size, + in_chans, + embed_size, + dtype=dtype, + flatten=flatten, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + position_embed_initializer=position_embed_initializer, + ) + super().__init__(embed) diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/legacy/nn/layer/colossalai_layer/linear.py similarity index 99% rename from colossalai/nn/layer/colossalai_layer/linear.py rename to colossalai/legacy/nn/layer/colossalai_layer/linear.py index 3e0c6e285..c05ceb66c 100644 --- a/colossalai/nn/layer/colossalai_layer/linear.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/linear.py @@ -4,9 +4,9 @@ from typing import Callable from torch import dtype, nn +from colossalai.nn import init from colossalai.utils import get_current_device -from ... 
import init as init from ..parallel_1d import * from ..parallel_2d import * from ..parallel_2p5d import * diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py similarity index 97% rename from colossalai/nn/layer/colossalai_layer/normalization.py rename to colossalai/legacy/nn/layer/colossalai_layer/normalization.py index 86861d302..f8e317e72 100644 --- a/colossalai/nn/layer/colossalai_layer/normalization.py +++ b/colossalai/legacy/nn/layer/colossalai_layer/normalization.py @@ -1,41 +1,42 @@ -from colossalai.utils import get_current_device -from torch import nn - -from ..parallel_1d import LayerNorm1D -from ..parallel_2d import LayerNorm2D -from ..parallel_2p5d import LayerNorm2p5D -from ..parallel_3d import LayerNorm3D -from ..utils import get_tensor_parallel_mode -from ..vanilla import VanillaLayerNorm -from ._utils import ColossalaiModule - -_parallel_layernorm = { - None: VanillaLayerNorm, - "1d": LayerNorm1D, - "2d": LayerNorm2D, - "2.5d": LayerNorm2p5D, - "3d": LayerNorm3D, -} - - -class LayerNorm(ColossalaiModule): - r"""Layer Normalization for colossalai. - - Args: - normalized_shape (int): input shape from an expected input of size. - :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] - \times \ldots \times \text{normalized_shape}[-1]]` - If a single integer is used, it is treated as a singleton list, and this module will - normalize over the last dimension which is expected to be of that specific size. - eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. - bias (bool, optional): Whether to add a bias, defaults to ``True``. - dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. - """ - - def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is None: - norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) - else: - norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) - super().__init__(norm) +from torch import nn + +from colossalai.utils import get_current_device + +from ..parallel_1d import LayerNorm1D +from ..parallel_2d import LayerNorm2D +from ..parallel_2p5d import LayerNorm2p5D +from ..parallel_3d import LayerNorm3D +from ..utils import get_tensor_parallel_mode +from ..vanilla import VanillaLayerNorm +from ._utils import ColossalaiModule + +_parallel_layernorm = { + None: VanillaLayerNorm, + "1d": LayerNorm1D, + "2d": LayerNorm2D, + "2.5d": LayerNorm2p5D, + "3d": LayerNorm3D, +} + + +class LayerNorm(ColossalaiModule): + r"""Layer Normalization for colossalai. + + Args: + normalized_shape (int): input shape from an expected input of size. + :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] + \times \ldots \times \text{normalized_shape}[-1]]` + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps (float): a value added to the denominator for numerical stability, defaults to 1e-05. + bias (bool, optional): Whether to add a bias, defaults to ``True``. + dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None. 
+ """ + + def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is None: + norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device()) + else: + norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) + super().__init__(norm) diff --git a/colossalai/legacy/nn/layer/parallel_1d/__init__.py b/colossalai/legacy/nn/layer/parallel_1d/__init__.py new file mode 100644 index 000000000..9cffd4d33 --- /dev/null +++ b/colossalai/legacy/nn/layer/parallel_1d/__init__.py @@ -0,0 +1,17 @@ +from .layers import ( + Classifier1D, + Dropout1D, + Embedding1D, + LayerNorm1D, + Linear1D, + Linear1D_Col, + Linear1D_Row, + PatchEmbedding1D, + VocabParallelClassifier1D, + VocabParallelEmbedding1D, +) + +__all__ = [ + 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', + 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' +] diff --git a/colossalai/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py similarity index 100% rename from colossalai/nn/layer/parallel_1d/_operation.py rename to colossalai/legacy/nn/layer/parallel_1d/_operation.py diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/legacy/nn/layer/parallel_1d/_utils.py similarity index 99% rename from colossalai/nn/layer/parallel_1d/_utils.py rename to colossalai/legacy/nn/layer/parallel_1d/_utils.py index 1212d5956..fddf4e73d 100644 --- a/colossalai/nn/layer/parallel_1d/_utils.py +++ b/colossalai/legacy/nn/layer/parallel_1d/_utils.py @@ -3,6 +3,7 @@ import torch import torch.distributed as dist + from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env @@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function): class _SplitForwardGatherBackward(torch.autograd.Function): """ Split the input and keep only the corresponding chuck to the rank. - + Args: input_: input matrix. parallel_mode: parallel mode. 
diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/legacy/nn/layer/parallel_1d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_1d/layers.py rename to colossalai/legacy/nn/layer/parallel_1d/layers.py index 7b129009e..c0a169c15 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_1d/layers.py @@ -10,11 +10,11 @@ import torch.nn.functional as F from torch import Tensor from torch.nn.parameter import Parameter -from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env from colossalai.kernel import LayerNorm +from colossalai.legacy.communication import broadcast from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init from colossalai.utils.checkpointing import ( diff --git a/colossalai/nn/layer/parallel_2d/__init__.py b/colossalai/legacy/nn/layer/parallel_2d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2d/__init__.py index 5562d1a70..9c65f3608 100644 --- a/colossalai/nn/layer/parallel_2d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2d, split_batch_2d -from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D, - VocabParallelEmbedding2D) +from .layers import ( + Classifier2D, + Embedding2D, + LayerNorm2D, + Linear2D, + PatchEmbedding2D, + VocabParallelClassifier2D, + VocabParallelEmbedding2D, +) __all__ = [ 'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D', diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/legacy/nn/layer/parallel_2d/_operation.py similarity index 98% rename from colossalai/nn/layer/parallel_2d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2d/_operation.py index 306577dbd..fa9b49bcf 100644 --- a/colossalai/nn/layer/parallel_2d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_2d/_operation.py @@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple import torch import torch.distributed as dist -from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter) -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.utils import get_current_device from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd + +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter +from colossalai.utils import get_current_device def matmul_2d( @@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function): col_group = gpc.get_group(col_parallel_mode) src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size opa = [None] * 2 opb = [None] * 2 @@ -351,9 +352,9 @@ 
class Matmul_ABT_2D(torch.autograd.Function): col_group = gpc.get_group(col_parallel_mode) src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size opb = [None] * 2 opr = [None] * 2 @@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function): col_group = gpc.get_group(col_parallel_mode) src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size + pipeline_parallel_rank * tensor_parallel_size opa = [None] * 2 opr = [None] * 2 diff --git a/colossalai/nn/layer/parallel_2d/_utils.py b/colossalai/legacy/nn/layer/parallel_2d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2d/_utils.py diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/legacy/nn/layer/parallel_2d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_2d/layers.py rename to colossalai/legacy/nn/layer/parallel_2d/layers.py index 1a01d5437..b458d15c7 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_2d/layers.py @@ -8,10 +8,10 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.communication import broadcast from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init from colossalai.utils.checkpointing import gather_tensor_parallel_state_dict, partition_tensor_parallel_state_dict diff --git a/colossalai/nn/layer/parallel_2p5d/__init__.py b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py similarity index 59% rename from colossalai/nn/layer/parallel_2p5d/__init__.py rename to colossalai/legacy/nn/layer/parallel_2p5d/__init__.py index bec3b1c4b..23e47e6ed 100644 --- a/colossalai/nn/layer/parallel_2p5d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_2p5d, split_batch_2p5d -from .layers import (Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D, - VocabParallelClassifier2p5D, VocabParallelEmbedding2p5D) +from .layers import ( + Classifier2p5D, + Embedding2p5D, + LayerNorm2p5D, + Linear2p5D, + PatchEmbedding2p5D, + VocabParallelClassifier2p5D, + VocabParallelEmbedding2p5D, +) __all__ = [ 'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D', diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py similarity index 99% rename from colossalai/nn/layer/parallel_2p5d/_operation.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_operation.py index 5a0f537cd..55defa4a3 100644 --- a/colossalai/nn/layer/parallel_2p5d/_operation.py +++ 
b/colossalai/legacy/nn/layer/parallel_2p5d/_operation.py @@ -2,13 +2,14 @@ from typing import Any, Tuple import torch import torch.distributed as dist -from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter) -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.utils import get_current_device from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce_scatter +from colossalai.utils import get_current_device + def get_parallel_group(parallel_mode: ParallelMode): return gpc.get_group(parallel_mode) diff --git a/colossalai/nn/layer/parallel_2p5d/_utils.py b/colossalai/legacy/nn/layer/parallel_2p5d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_2p5d/_utils.py rename to colossalai/legacy/nn/layer/parallel_2p5d/_utils.py diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_2p5d/layers.py rename to colossalai/legacy/nn/layer/parallel_2p5d/layers.py index 62c4292fd..04acc2bb0 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_2p5d/layers.py @@ -8,10 +8,10 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.communication import broadcast from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init from colossalai.utils.checkpointing import ( diff --git a/colossalai/nn/layer/parallel_3d/__init__.py b/colossalai/legacy/nn/layer/parallel_3d/__init__.py similarity index 62% rename from colossalai/nn/layer/parallel_3d/__init__.py rename to colossalai/legacy/nn/layer/parallel_3d/__init__.py index 9ae255b44..17fe8403c 100644 --- a/colossalai/nn/layer/parallel_3d/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_3d/__init__.py @@ -1,6 +1,13 @@ from ._operation import reduce_by_batch_3d, split_batch_3d, split_tensor_3d -from .layers import (Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D, VocabParallelClassifier3D, - VocabParallelEmbedding3D) +from .layers import ( + Classifier3D, + Embedding3D, + LayerNorm3D, + Linear3D, + PatchEmbedding3D, + VocabParallelClassifier3D, + VocabParallelEmbedding3D, +) __all__ = [ 'reduce_by_batch_3d', 'split_tensor_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/legacy/nn/layer/parallel_3d/_operation.py similarity index 99% rename from colossalai/nn/layer/parallel_3d/_operation.py rename to colossalai/legacy/nn/layer/parallel_3d/_operation.py index 5dc9a2428..ca0b0e627 100755 --- a/colossalai/nn/layer/parallel_3d/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_3d/_operation.py @@ -7,10 +7,10 @@ import torch from torch import Tensor from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D from 
colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.communication import all_gather, all_reduce, broadcast, reduce, reduce_scatter from ._utils import get_parallel_mode_from_env, push_async_grad diff --git a/colossalai/nn/layer/parallel_3d/_utils.py b/colossalai/legacy/nn/layer/parallel_3d/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_3d/_utils.py rename to colossalai/legacy/nn/layer/parallel_3d/_utils.py diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/legacy/nn/layer/parallel_3d/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_3d/layers.py rename to colossalai/legacy/nn/layer/parallel_3d/layers.py index 7d940aa27..b815a842c 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/legacy/nn/layer/parallel_3d/layers.py @@ -8,14 +8,14 @@ import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter -from colossalai.communication import all_reduce, broadcast from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.communication import all_reduce, broadcast +from colossalai.legacy.nn.layer.base_layer import ParallelLayer from colossalai.legacy.registry import LAYERS from colossalai.nn import init as init -from colossalai.nn.layer.base_layer import ParallelLayer from colossalai.utils.checkpointing import ( broadcast_state_dict, gather_tensor_parallel_state_dict, diff --git a/colossalai/nn/layer/parallel_sequence/__init__.py b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py similarity index 74% rename from colossalai/nn/layer/parallel_sequence/__init__.py rename to colossalai/legacy/nn/layer/parallel_sequence/__init__.py index 4fa9eed6f..d92d66d40 100644 --- a/colossalai/nn/layer/parallel_sequence/__init__.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/__init__.py @@ -1,4 +1,4 @@ -from ._operation import RingQK, RingAV +from ._operation import RingAV, RingQK from .layers import TransformerSelfAttentionRing __all__ = ['TransformerSelfAttentionRing', 'RingAV', 'RingQK'] diff --git a/colossalai/nn/layer/parallel_sequence/_operation.py b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py similarity index 97% rename from colossalai/nn/layer/parallel_sequence/_operation.py rename to colossalai/legacy/nn/layer/parallel_sequence/_operation.py index fc8049422..fcf296201 100644 --- a/colossalai/nn/layer/parallel_sequence/_operation.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/_operation.py @@ -3,13 +3,13 @@ import torch from torch import distributed as dist +from torch.cuda.amp import custom_bwd, custom_fwd -from colossalai.communication import ring_forward from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.parallel_sequence._utils import _calc_incoming_device_range, _calc_current_device_range +from colossalai.legacy.communication import ring_forward +from colossalai.legacy.nn.layer.parallel_sequence._utils import _calc_current_device_range, _calc_incoming_device_range from colossalai.utils import get_current_device -from torch.cuda.amp import custom_bwd, custom_fwd class RingQK(torch.autograd.Function): diff --git 
a/colossalai/nn/layer/parallel_sequence/_utils.py b/colossalai/legacy/nn/layer/parallel_sequence/_utils.py similarity index 100% rename from colossalai/nn/layer/parallel_sequence/_utils.py rename to colossalai/legacy/nn/layer/parallel_sequence/_utils.py diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/legacy/nn/layer/parallel_sequence/layers.py similarity index 99% rename from colossalai/nn/layer/parallel_sequence/layers.py rename to colossalai/legacy/nn/layer/parallel_sequence/layers.py index 4d0ff2e06..e44e61c2f 100644 --- a/colossalai/nn/layer/parallel_sequence/layers.py +++ b/colossalai/legacy/nn/layer/parallel_sequence/layers.py @@ -14,8 +14,8 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.kernel import FusedScaleMaskSoftmax from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType +from colossalai.legacy.nn.layer.parallel_sequence._operation import RingAV, RingQK from colossalai.legacy.registry import LAYERS -from colossalai.nn.layer.parallel_sequence._operation import RingAV, RingQK @LAYERS.register_module diff --git a/colossalai/legacy/nn/layer/utils/__init__.py b/colossalai/legacy/nn/layer/utils/__init__.py new file mode 100644 index 000000000..56e969bfd --- /dev/null +++ b/colossalai/legacy/nn/layer/utils/__init__.py @@ -0,0 +1,15 @@ +from .common import ( + ACT2FN, + CheckpointModule, + _ntuple, + divide, + get_tensor_parallel_mode, + set_tensor_parallel_attribute_by_partition, + set_tensor_parallel_attribute_by_size, + to_2tuple, +) + +__all__ = [ + 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', + 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' +] diff --git a/colossalai/nn/layer/utils/common.py b/colossalai/legacy/nn/layer/utils/common.py similarity index 99% rename from colossalai/nn/layer/utils/common.py rename to colossalai/legacy/nn/layer/utils/common.py index f2297304f..d8f3ad2a7 100644 --- a/colossalai/nn/layer/utils/common.py +++ b/colossalai/legacy/nn/layer/utils/common.py @@ -6,10 +6,11 @@ from itertools import repeat import numpy as np import torch +from torch import Tensor, nn + from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS from colossalai.global_variables import tensor_parallel_env as env from colossalai.utils import checkpoint -from torch import Tensor, nn class CheckpointModule(nn.Module): diff --git a/colossalai/nn/layer/vanilla/__init__.py b/colossalai/legacy/nn/layer/vanilla/__init__.py similarity index 100% rename from colossalai/nn/layer/vanilla/__init__.py rename to colossalai/legacy/nn/layer/vanilla/__init__.py diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/legacy/nn/layer/vanilla/layers.py similarity index 100% rename from colossalai/nn/layer/vanilla/layers.py rename to colossalai/legacy/nn/layer/vanilla/layers.py diff --git a/colossalai/nn/layer/wrapper/__init__.py b/colossalai/legacy/nn/layer/wrapper/__init__.py similarity index 100% rename from colossalai/nn/layer/wrapper/__init__.py rename to colossalai/legacy/nn/layer/wrapper/__init__.py diff --git a/colossalai/nn/layer/wrapper/pipeline_wrapper.py b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py similarity index 99% rename from colossalai/nn/layer/wrapper/pipeline_wrapper.py rename to colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py index ef1d794cc..68fea8622 100644 --- a/colossalai/nn/layer/wrapper/pipeline_wrapper.py +++ 
b/colossalai/legacy/nn/layer/wrapper/pipeline_wrapper.py @@ -1,6 +1,8 @@ -import torch.nn as nn -import torch.distributed as dist from typing import List, Tuple, Union + +import torch.distributed as dist +import torch.nn as nn + from colossalai.context import ParallelMode from colossalai.core import global_context as gpc diff --git a/colossalai/legacy/nn/loss/__init__.py b/colossalai/legacy/nn/loss/__init__.py new file mode 100644 index 000000000..1bd8872d9 --- /dev/null +++ b/colossalai/legacy/nn/loss/__init__.py @@ -0,0 +1,41 @@ +from torch import nn +from torch.nn.modules.loss import * +from torch.nn.modules.loss import _Loss + +from colossalai.global_variables import tensor_parallel_env as env +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from .loss_1d import VocabParallelCrossEntropyLoss1D +from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D +from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D +from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D + +_parallel_cross_entropy = { + '2d': CrossEntropyLoss2D, + '2.5d': CrossEntropyLoss2p5D, + '3d': CrossEntropyLoss3D, +} + +_vocab_parallel_cross_entropy = { + '1d': VocabParallelCrossEntropyLoss1D, + '2d': VocabParallelCrossEntropyLoss2D, + '2.5d': VocabParallelCrossEntropyLoss2p5D, + '3d': VocabParallelCrossEntropyLoss3D, +} + + +class CrossEntropyLoss(_Loss): + + def __init__(self, reduction: bool = True, *args, **kwargs): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel is not None and env.vocab_parallel: + self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + elif tensor_parallel is None or tensor_parallel == '1d': + reduction = 'mean' if reduction else 'none' + self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) + else: + self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) + + def forward(self, *args): + return self.loss(*args) diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/legacy/nn/loss/loss_1d.py similarity index 100% rename from colossalai/nn/loss/loss_1d.py rename to colossalai/legacy/nn/loss/loss_1d.py diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/legacy/nn/loss/loss_2d.py similarity index 97% rename from colossalai/nn/loss/loss_2d.py rename to colossalai/legacy/nn/loss/loss_2d.py index 6db40c0f3..6191602b7 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/legacy/nn/loss/loss_2d.py @@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.legacy.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d -from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/legacy/nn/loss/loss_2p5d.py similarity index 96% rename from colossalai/nn/loss/loss_2p5d.py rename to colossalai/legacy/nn/loss/loss_2p5d.py index 9c78a1ef0..2746b2011 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ b/colossalai/legacy/nn/loss/loss_2p5d.py @@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss from colossalai.context import ParallelMode from 
colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.legacy.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d -from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.utils import get_current_device diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/legacy/nn/loss/loss_3d.py similarity index 97% rename from colossalai/nn/loss/loss_3d.py rename to colossalai/legacy/nn/loss/loss_3d.py index 5c0f26640..2aeb1bd98 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/legacy/nn/loss/loss_3d.py @@ -6,9 +6,9 @@ from torch.nn.modules.loss import _Loss from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.legacy.registry import LOSSES -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.utils import get_current_device diff --git a/colossalai/nn/metric/__init__.py b/colossalai/legacy/nn/metric/__init__.py similarity index 87% rename from colossalai/nn/metric/__init__.py rename to colossalai/legacy/nn/metric/__init__.py index 00833b611..76c6dac89 100644 --- a/colossalai/nn/metric/__init__.py +++ b/colossalai/legacy/nn/metric/__init__.py @@ -1,26 +1,28 @@ -from torch import nn - -from ._utils import calc_acc -from .accuracy_2d import Accuracy2D -from .accuracy_2p5d import Accuracy2p5D -from .accuracy_3d import Accuracy3D -from colossalai.nn.layer.utils import get_tensor_parallel_mode - -_parallel_accuracy = { - '2d': Accuracy2D, - '2.5d': Accuracy2p5D, - '3d': Accuracy3D, -} - - -class Accuracy(nn.Module): - def __init__(self): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel not in _parallel_accuracy: - self.acc = calc_acc - else: - self.acc = _parallel_accuracy[tensor_parallel]() - - def forward(self, *args): - return self.acc(*args) +from torch import nn + +from colossalai.legacy.nn.layer.utils import get_tensor_parallel_mode + +from ._utils import calc_acc +from .accuracy_2d import Accuracy2D +from .accuracy_2p5d import Accuracy2p5D +from .accuracy_3d import Accuracy3D + +_parallel_accuracy = { + '2d': Accuracy2D, + '2.5d': Accuracy2p5D, + '3d': Accuracy3D, +} + + +class Accuracy(nn.Module): + + def __init__(self): + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel not in _parallel_accuracy: + self.acc = calc_acc + else: + self.acc = _parallel_accuracy[tensor_parallel]() + + def forward(self, *args): + return self.acc(*args) diff --git a/colossalai/nn/metric/_utils.py b/colossalai/legacy/nn/metric/_utils.py similarity index 95% rename from colossalai/nn/metric/_utils.py rename to colossalai/legacy/nn/metric/_utils.py index eac591b64..8706ffc10 100644 --- a/colossalai/nn/metric/_utils.py +++ b/colossalai/legacy/nn/metric/_utils.py @@ -1,7 +1,7 @@ -import torch - - -def calc_acc(logits, targets): - preds = torch.argmax(logits, dim=-1) - correct = torch.sum(targets == preds) - return correct +import torch + + +def calc_acc(logits, 
targets): + preds = torch.argmax(logits, dim=-1) + correct = torch.sum(targets == preds) + return correct diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/legacy/nn/metric/accuracy_2d.py similarity index 89% rename from colossalai/nn/metric/accuracy_2d.py rename to colossalai/legacy/nn/metric/accuracy_2d.py index a86832973..838c48834 100644 --- a/colossalai/nn/metric/accuracy_2d.py +++ b/colossalai/legacy/nn/metric/accuracy_2d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d from torch import nn +from colossalai.legacy.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/legacy/nn/metric/accuracy_2p5d.py similarity index 88% rename from colossalai/nn/metric/accuracy_2p5d.py rename to colossalai/legacy/nn/metric/accuracy_2p5d.py index 3044da065..183380cd9 100644 --- a/colossalai/nn/metric/accuracy_2p5d.py +++ b/colossalai/legacy/nn/metric/accuracy_2p5d.py @@ -1,7 +1,8 @@ import torch -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d from torch import nn +from colossalai.legacy.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d + from ._utils import calc_acc diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/legacy/nn/metric/accuracy_3d.py similarity index 85% rename from colossalai/nn/metric/accuracy_3d.py rename to colossalai/legacy/nn/metric/accuracy_3d.py index 5506fc1d2..1aaac73ec 100644 --- a/colossalai/nn/metric/accuracy_3d.py +++ b/colossalai/legacy/nn/metric/accuracy_3d.py @@ -1,33 +1,35 @@ -import torch -from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d -from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env -from torch import nn - -from ._utils import calc_acc - - -class Accuracy3D(nn.Module): - """Accuracy for 3D parallelism - """ - def __init__(self): - super().__init__() - self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) - self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) - - def forward(self, logits, targets): - """Calculate the accuracy of predicted labels. - - Args: - logits (:class:`torch.tensor`): Predicted labels. - targets (:class:`torch.tensor`): True labels from data. - - Returns: - float: the accuracy of prediction. - """ - with torch.no_grad(): - targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) - targets = split_tensor_3d(targets, 0, self.input_parallel_mode) - correct = calc_acc(logits, targets) - correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) - return correct +import torch +from torch import nn + +from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D +from colossalai.legacy.nn.layer.parallel_3d import reduce_by_batch_3d, split_tensor_3d +from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env + +from ._utils import calc_acc + + +class Accuracy3D(nn.Module): + """Accuracy for 3D parallelism + """ + + def __init__(self): + super().__init__() + self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) + self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D) + + def forward(self, logits, targets): + """Calculate the accuracy of predicted labels. + + Args: + logits (:class:`torch.tensor`): Predicted labels. 
+ targets (:class:`torch.tensor`): True labels from data. + + Returns: + float: the accuracy of prediction. + """ + with torch.no_grad(): + targets = split_tensor_3d(targets, 0, self.weight_parallel_mode) + targets = split_tensor_3d(targets, 0, self.input_parallel_mode) + correct = calc_acc(logits, targets) + correct = reduce_by_batch_3d(correct, self.input_parallel_mode, self.weight_parallel_mode) + return correct diff --git a/colossalai/nn/parallel/__init__.py b/colossalai/legacy/nn/parallel/__init__.py similarity index 100% rename from colossalai/nn/parallel/__init__.py rename to colossalai/legacy/nn/parallel/__init__.py diff --git a/colossalai/nn/parallel/data_parallel.py b/colossalai/legacy/nn/parallel/data_parallel.py similarity index 100% rename from colossalai/nn/parallel/data_parallel.py rename to colossalai/legacy/nn/parallel/data_parallel.py diff --git a/colossalai/nn/parallel/layers/__init__.py b/colossalai/legacy/nn/parallel/layers/__init__.py similarity index 56% rename from colossalai/nn/parallel/layers/__init__.py rename to colossalai/legacy/nn/parallel/layers/__init__.py index 29b8353e6..f38124efe 100644 --- a/colossalai/nn/parallel/layers/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/__init__.py @@ -1,10 +1,17 @@ +from .cache_embedding import ( + CachedEmbeddingBag, + CachedParamMgr, + EvictionStrategy, + LimitBuffIndexCopyer, + ParallelCachedEmbeddingBag, + ParallelCachedEmbeddingBagTablewise, + ParallelCachedEmbeddingBagTablewiseSpiltCache, + TablewiseEmbeddingBagConfig, +) from .colo_module import ColoModule -from .linear import ColoLinear from .embedding import ColoEmbedding -from .module_utils import register_colo_module, is_colo_module, get_colo_module, init_colo_module, check_colo_module - -from .cache_embedding import CachedEmbeddingBag, ParallelCachedEmbeddingBag, CachedParamMgr, LimitBuffIndexCopyer, EvictionStrategy, \ - ParallelCachedEmbeddingBagTablewise, TablewiseEmbeddingBagConfig, ParallelCachedEmbeddingBagTablewiseSpiltCache +from .linear import ColoLinear +from .module_utils import check_colo_module, get_colo_module, init_colo_module, is_colo_module, register_colo_module __all__ = [ 'ColoModule', 'register_colo_module', 'is_colo_module', 'get_colo_module', 'init_colo_module', 'check_colo_module', diff --git a/colossalai/nn/parallel/layers/cache_embedding/__init__.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/__init__.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py index 5bbc931a7..d87930c1c 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/__init__.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/__init__.py @@ -1,8 +1,8 @@ from .cache_mgr import CachedParamMgr, EvictionStrategy -from .copyer import LimitBuffIndexCopyer from .cached_embedding import CachedEmbeddingBag -from .parallel_cached_embedding import ParallelCachedEmbeddingBag +from .copyer import LimitBuffIndexCopyer from .embedding_config import TablewiseEmbeddingBagConfig +from .parallel_cached_embedding import ParallelCachedEmbeddingBag from .parallel_cached_embedding_tablewise import ParallelCachedEmbeddingBagTablewise from .parallel_cached_embedding_tablewise_split_cache import ParallelCachedEmbeddingBagTablewiseSpiltCache diff --git a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py similarity index 99% rename from 
colossalai/nn/parallel/layers/cache_embedding/base_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py index 705835a0e..9558c541e 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/base_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/base_embedding.py @@ -1,4 +1,5 @@ import abc + import torch.nn as nn diff --git a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py index a6159856d..16530c4ce 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cache_mgr.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cache_mgr.py @@ -1,12 +1,14 @@ -import numpy as np -import torch -from torch.profiler import record_function -from typing import List, Optional -from contexttimer import Timer -from .copyer import LimitBuffIndexCopyer -from enum import Enum import sys from contextlib import contextmanager +from enum import Enum +from typing import List, Optional + +import numpy as np +import torch +from contexttimer import Timer +from torch.profiler import record_function + +from .copyer import LimitBuffIndexCopyer class EvictionStrategy(Enum): @@ -35,7 +37,7 @@ def _wait_for_data(t, stream: Optional[torch.cuda.streams.Stream]) -> None: class CachedParamMgr(torch.nn.Module): """ Manage Embedding Weights on CPU and CUDA memory uses a software cache. - CPU maintains the entire original weight. + CPU maintains the entire original weight. CUDA maintains a fraction of the weights used in the upcoming computation. The row number in CUDA is controlled by `cuda_row_num`. During training, GPU needs to transmit embedding rows between CPU and GPU. Args: @@ -115,7 +117,7 @@ class CachedParamMgr(torch.nn.Module): self._elapsed_dict[name] += t.elapsed def _find_evict_gpu_idxs(self, evict_num: int) -> torch.Tensor: - """_find_evict_gpu_idxs + """_find_evict_gpu_idxs Find the gpu idxs to be evicted, according to their freq. Args: evict_num (int): how many rows has to be evicted @@ -202,7 +204,7 @@ class CachedParamMgr(torch.nn.Module): """reorder reorder the weight according to ids' frequency in dataset before training. Execute only once before training, also known as warmup phase. - + Note: If you would like to use the DATASET as the eviction strategy, you must call this function. Note: @@ -516,7 +518,7 @@ class CachedParamMgr(torch.nn.Module): """ deprecated evict one row from cuda to cpu. - Returns: + Returns: (int) : the slot id be evicted. 
""" mask = torch.logical_or(torch.isin(self.cached_idx_map, self.evict_backlist), self.cached_idx_map == -1) diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py similarity index 98% rename from colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py index a74cb8d94..bc7d17890 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/cached_embedding.py @@ -1,10 +1,11 @@ +from typing import Iterator, List, Optional, Tuple, Union + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple, Union +from torch.nn.parameter import Parameter from .base_embedding import BaseEmbeddingBag from .cache_mgr import CachedParamMgr, EvictionStrategy -from torch.nn.parameter import Parameter class CachedEmbeddingBag(BaseEmbeddingBag): @@ -27,7 +28,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag): include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False. dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32. device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu. - cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row + cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None. warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7. buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0. @@ -85,10 +86,10 @@ class CachedEmbeddingBag(BaseEmbeddingBag): buffer_size=50_000, pin_weight=False): """ - Called after initialized. + Called after initialized. Reorder the weight rows according to the ids_freq_mapping. Then, let the weights of the Module be managed by a CachedParamMgr. - + Args: cuda_row_num (int): number of rows can be hosted in CUDA memory ids_freq_mapping (List[int]): a list, idx is id number, value is freq diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py similarity index 97% rename from colossalai/nn/parallel/layers/cache_embedding/copyer.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py index aa1f79448..804a07f88 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/copyer.py @@ -3,7 +3,7 @@ from torch import LongTensor class LimitBuffIndexCopyer(object): - """LimitBuffIndexCopyer + """LimitBuffIndexCopyer Index Copy using limited temp buffer on CUDA. Args: @@ -15,7 +15,7 @@ class LimitBuffIndexCopyer(object): @torch.no_grad() def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor): - """copy + """copy src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index] The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered. 
diff --git a/colossalai/nn/parallel/layers/cache_embedding/embedding_config.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py similarity index 100% rename from colossalai/nn/parallel/layers/cache_embedding/embedding_config.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/embedding_config.py diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py similarity index 96% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py index d7f77e195..79d7672b2 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding.py @@ -1,12 +1,13 @@ +from typing import Iterator, List, Optional, Tuple + import torch import torch.nn.functional as F -from typing import List, Optional, Iterator, Tuple -from .cached_embedding import CachedEmbeddingBag -from colossalai.nn._ops._utils import dual_all_to_all +from colossalai.legacy.nn._ops._utils import dual_all_to_all +from colossalai.tensor import ColoParameter, ColoTensor, ColoTensorSpec, ComputePattern, ProcessGroup, ShardSpec -from colossalai.tensor import ColoParameter, ShardSpec, ComputePattern, ProcessGroup, ColoTensorSpec, ColoTensor from .cache_mgr import CachedParamMgr, EvictionStrategy +from .cached_embedding import CachedEmbeddingBag def get_partition(embedding_dim, rank, world_size) -> Tuple[int, int, bool]: diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py index 949f85ad4..116d836b7 100644 --- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise.py @@ -1,15 +1,16 @@ +import time +from typing import List + import torch import torch.distributed as dist import torch.nn.functional as F -from .cached_embedding import CachedEmbeddingBag -from .cache_mgr import EvictionStrategy -from .embedding_config import TablewiseEmbeddingBagConfig +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from typing import List -import time +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewise(CachedEmbeddingBag): diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py similarity index 99% rename from colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py rename to colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py index 80a54b4fa..0014c784f 100644 --- 
a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py +++ b/colossalai/legacy/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py @@ -1,17 +1,17 @@ +import abc +from typing import List + import torch import torch.distributed as dist import torch.nn as nn from torch.profiler import record_function -from .cached_embedding import CachedEmbeddingBag - +from colossalai.legacy.nn._ops._utils import dual_all_to_all_tablewise from colossalai.tensor import ProcessGroup -from colossalai.nn._ops._utils import dual_all_to_all_tablewise -from .embedding_config import TablewiseEmbeddingBagConfig -from .cache_mgr import EvictionStrategy -from typing import List -import abc +from .cache_mgr import EvictionStrategy +from .cached_embedding import CachedEmbeddingBag +from .embedding_config import TablewiseEmbeddingBagConfig class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module): diff --git a/colossalai/nn/parallel/layers/colo_module.py b/colossalai/legacy/nn/parallel/layers/colo_module.py similarity index 98% rename from colossalai/nn/parallel/layers/colo_module.py rename to colossalai/legacy/nn/parallel/layers/colo_module.py index 8f0f5d5f5..a0a3eb40c 100644 --- a/colossalai/nn/parallel/layers/colo_module.py +++ b/colossalai/legacy/nn/parallel/layers/colo_module.py @@ -1,6 +1,7 @@ -from colossalai.tensor.distspec import _DistSpec +from typing import Dict, List + from colossalai.tensor import ComputePattern -from typing import List, Dict +from colossalai.tensor.distspec import _DistSpec class ColoModule(object): diff --git a/colossalai/nn/parallel/layers/embedding.py b/colossalai/legacy/nn/parallel/layers/embedding.py similarity index 92% rename from colossalai/nn/parallel/layers/embedding.py rename to colossalai/legacy/nn/parallel/layers/embedding.py index ccacc1ead..3e4e7ffd8 100644 --- a/colossalai/nn/parallel/layers/embedding.py +++ b/colossalai/legacy/nn/parallel/layers/embedding.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoEmbedding(ColoModule): diff --git a/colossalai/nn/parallel/layers/linear.py b/colossalai/legacy/nn/parallel/layers/linear.py similarity index 93% rename from colossalai/nn/parallel/layers/linear.py rename to colossalai/legacy/nn/parallel/layers/linear.py index 84a8c0425..e391cf808 100644 --- a/colossalai/nn/parallel/layers/linear.py +++ b/colossalai/legacy/nn/parallel/layers/linear.py @@ -1,5 +1,6 @@ +from colossalai.tensor import ComputePattern, ProcessGroup, ShardSpec, distspec + from .colo_module import ColoModule -from colossalai.tensor import ComputePattern, distspec, ProcessGroup, ShardSpec class ColoLinear(ColoModule): diff --git a/colossalai/nn/parallel/layers/module_utils.py b/colossalai/legacy/nn/parallel/layers/module_utils.py similarity index 99% rename from colossalai/nn/parallel/layers/module_utils.py rename to colossalai/legacy/nn/parallel/layers/module_utils.py index 38d128cc7..191266fa7 100644 --- a/colossalai/nn/parallel/layers/module_utils.py +++ b/colossalai/legacy/nn/parallel/layers/module_utils.py @@ -1,9 +1,11 @@ from typing import Dict -from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup -from colossalai.tensor import distspec -from . import ColoModule + import torch +from colossalai.tensor import ColoParameter, ComputeSpec, ProcessGroup, distspec + +from . 
import ColoModule + _COLOSSAL_MODULES: Dict[type, ColoModule] = {} diff --git a/colossalai/nn/parallel/reducer.py b/colossalai/legacy/nn/parallel/reducer.py similarity index 100% rename from colossalai/nn/parallel/reducer.py rename to colossalai/legacy/nn/parallel/reducer.py diff --git a/colossalai/legacy/trainer/hooks/_metric_hook.py b/colossalai/legacy/trainer/hooks/_metric_hook.py index d0598c240..f1bd19387 100644 --- a/colossalai/legacy/trainer/hooks/_metric_hook.py +++ b/colossalai/legacy/trainer/hooks/_metric_hook.py @@ -7,9 +7,9 @@ from typing import Callable import torch import torch.distributed as dist -from colossalai.communication import all_reduce from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.communication import all_reduce from colossalai.legacy.registry import HOOKS from colossalai.utils import get_current_device, is_no_pp_or_last_stage diff --git a/colossalai/logging/logger.py b/colossalai/logging/logger.py index af7b7de54..f9abe4a2a 100644 --- a/colossalai/logging/logger.py +++ b/colossalai/logging/logger.py @@ -6,8 +6,7 @@ import logging from pathlib import Path from typing import List, Union -import colossalai -from colossalai.context.parallel_mode import ParallelMode +import torch.distributed as dist class DistributedLogger: @@ -63,6 +62,7 @@ class DistributedLogger: self._logger.propagate = False DistributedLogger.__instances[name] = self + self.rank = dist.get_rank() if dist.is_initialized() else 0 @staticmethod def __get_call_info(): @@ -109,16 +109,10 @@ class DistributedLogger: # create log directory path.mkdir(parents=True, exist_ok=True) - # set the default file name if path is a directory - if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL): - rank = 0 - else: - rank = colossalai.core.global_context.get_global_rank() - if suffix is not None: - log_file_name = f'rank_{rank}_{suffix}.log' + log_file_name = f'rank_{self.rank}_{suffix}.log' else: - log_file_name = f'rank_{rank}.log' + log_file_name = f'rank_{self.rank}.log' path = path.joinpath(log_file_name) # add file handler @@ -128,19 +122,14 @@ class DistributedLogger: file_handler.setFormatter(formatter) self._logger.addHandler(file_handler) - def _log(self, - level, - message: str, - parallel_mode: ParallelMode = ParallelMode.GLOBAL, - ranks: List[int] = None) -> None: + def _log(self, level, message: str, ranks: List[int] = None) -> None: if ranks is None: getattr(self._logger, level)(message) else: - local_rank = colossalai.core.global_context.get_local_rank(parallel_mode) - if local_rank in ranks: + if self.rank in ranks: getattr(self._logger, level)(message) - def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def info(self, message: str, ranks: List[int] = None) -> None: """Log an info message. Args: @@ -150,10 +139,10 @@ class DistributedLogger: ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('info', message_prefix, parallel_mode, ranks) - self._log('info', message, parallel_mode, ranks) + self._log('info', message_prefix, ranks) + self._log('info', message, ranks) - def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def warning(self, message: str, ranks: List[int] = None) -> None: """Log a warning message. Args: @@ -163,10 +152,10 @@ class DistributedLogger: ranks (List[int]): List of parallel ranks. 
""" message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('warning', message_prefix, parallel_mode, ranks) - self._log('warning', message, parallel_mode, ranks) + self._log('warning', message_prefix, ranks) + self._log('warning', message, ranks) - def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def debug(self, message: str, ranks: List[int] = None) -> None: """Log a debug message. Args: @@ -176,10 +165,10 @@ class DistributedLogger: ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('debug', message_prefix, parallel_mode, ranks) - self._log('debug', message, parallel_mode, ranks) + self._log('debug', message_prefix, ranks) + self._log('debug', message, ranks) - def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: List[int] = None) -> None: + def error(self, message: str, ranks: List[int] = None) -> None: """Log an error message. Args: @@ -189,5 +178,5 @@ class DistributedLogger: ranks (List[int]): List of parallel ranks. """ message_prefix = "{}:{} {}".format(*self.__get_call_info()) - self._log('error', message_prefix, parallel_mode, ranks) - self._log('error', message, parallel_mode, ranks) + self._log('error', message_prefix, ranks) + self._log('error', message, ranks) diff --git a/colossalai/nn/__init__.py b/colossalai/nn/__init__.py index 910ad2031..c6c4d3042 100644 --- a/colossalai/nn/__init__.py +++ b/colossalai/nn/__init__.py @@ -1,6 +1,5 @@ -from ._ops import * +from .init import * from .layer import * from .loss import * from .lr_scheduler import * -from .metric import * from .optimizer import * diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index b705632f8..edd986ef5 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -1,10 +1,2 @@ -from .colossalai_layer import * -from .parallel_1d import * -from .parallel_2d import * -from .parallel_2p5d import * -from .parallel_3d import * -from .parallel_sequence import * from .moe import * from .utils import * -from .vanilla import * -from .wrapper import * diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py deleted file mode 100644 index 2353851df..000000000 --- a/colossalai/nn/layer/parallel_1d/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row, - PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D) - -__all__ = [ - 'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D', - 'VocabParallelEmbedding1D', 'LayerNorm1D', 'PatchEmbedding1D' -] diff --git a/colossalai/nn/layer/utils.py b/colossalai/nn/layer/utils.py new file mode 100644 index 000000000..dc12ff8da --- /dev/null +++ b/colossalai/nn/layer/utils.py @@ -0,0 +1,14 @@ +def divide(numerator, denominator): + """Only allow exact division. + + Args: + numerator (int): Numerator of the division. + denominator (int): Denominator of the division. + + Returns: + int: the result of exact division. 
+ """ + assert denominator != 0, 'denominator can not be zero' + assert numerator % denominator == 0, \ + '{} is not divisible by {}'.format(numerator, denominator) + return numerator // denominator diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py deleted file mode 100644 index 7e999ee82..000000000 --- a/colossalai/nn/layer/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode, - set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple) - -__all__ = [ - 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', - 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' -] diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index 373e4ec94..ee2add48a 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -1,41 +1 @@ -from colossalai.global_variables import tensor_parallel_env as env -from colossalai.nn.layer.utils import get_tensor_parallel_mode -from torch import nn -from torch.nn.modules.loss import * -from torch.nn.modules.loss import _Loss - -from .loss_1d import VocabParallelCrossEntropyLoss1D -from .loss_2d import CrossEntropyLoss2D, VocabParallelCrossEntropyLoss2D -from .loss_2p5d import CrossEntropyLoss2p5D, VocabParallelCrossEntropyLoss2p5D -from .loss_3d import CrossEntropyLoss3D, VocabParallelCrossEntropyLoss3D from .loss_moe import MoeCrossEntropyLoss, MoeLoss - -_parallel_cross_entropy = { - '2d': CrossEntropyLoss2D, - '2.5d': CrossEntropyLoss2p5D, - '3d': CrossEntropyLoss3D, -} - -_vocab_parallel_cross_entropy = { - '1d': VocabParallelCrossEntropyLoss1D, - '2d': VocabParallelCrossEntropyLoss2D, - '2.5d': VocabParallelCrossEntropyLoss2p5D, - '3d': VocabParallelCrossEntropyLoss3D, -} - - -class CrossEntropyLoss(_Loss): - - def __init__(self, reduction: bool = True, *args, **kwargs): - super().__init__() - tensor_parallel = get_tensor_parallel_mode() - if tensor_parallel is not None and env.vocab_parallel: - self.loss = _vocab_parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - elif tensor_parallel is None or tensor_parallel == '1d': - reduction = 'mean' if reduction else 'none' - self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) - else: - self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs) - - def forward(self, *args): - return self.loss(*args) diff --git a/colossalai/nn/lr_scheduler/cosine.py b/colossalai/nn/lr_scheduler/cosine.py index 0010435c2..fb587e1a1 100644 --- a/colossalai/nn/lr_scheduler/cosine.py +++ b/colossalai/nn/lr_scheduler/cosine.py @@ -1,11 +1,8 @@ from torch.optim.lr_scheduler import CosineAnnealingLR as _CosineAnnealingLR -from colossalai.legacy.registry import LR_SCHEDULERS - from .delayed import DelayerScheduler, WarmupDelayerScheduler, WarmupScheduler -@LR_SCHEDULERS.register_module class CosineAnnealingLR(_CosineAnnealingLR): r"""Set the learning rate of each parameter group using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial lr and @@ -49,7 +46,6 @@ class CosineAnnealingLR(_CosineAnnealingLR): super().__init__(optimizer, total_steps, eta_min=eta_min, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class CosineAnnealingWarmupLR(WarmupScheduler): """Cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied. 
@@ -70,7 +66,6 @@ class CosineAnnealingWarmupLR(WarmupScheduler): super().__init__(optimizer, warmup_steps, base_scheduler) -@LR_SCHEDULERS.register_module class FlatAnnealingLR(DelayerScheduler): """Flat and cosine annealing learning rate scheduler. The learning rate will be a fixed value before starting decay. @@ -91,7 +86,6 @@ class FlatAnnealingLR(DelayerScheduler): super().__init__(optimizer, flat_steps, base_scheduler, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class FlatAnnealingWarmupLR(WarmupDelayerScheduler): """Flat and cosine annealing learning rate scheduler with learning rate warmup. A linear warmup schedule will be applied, and then the learning rate will be a fixed value before starting decay. diff --git a/colossalai/nn/lr_scheduler/linear.py b/colossalai/nn/lr_scheduler/linear.py index 251779647..21a865e4c 100644 --- a/colossalai/nn/lr_scheduler/linear.py +++ b/colossalai/nn/lr_scheduler/linear.py @@ -1,9 +1,6 @@ from torch.optim.lr_scheduler import _LRScheduler -from colossalai.legacy.registry import LR_SCHEDULERS - -@LR_SCHEDULERS.register_module class LinearWarmupLR(_LRScheduler): """Linearly warmup learning rate and then linearly decay. diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py index 4f18b49fc..c428c911c 100644 --- a/colossalai/nn/lr_scheduler/multistep.py +++ b/colossalai/nn/lr_scheduler/multistep.py @@ -2,12 +2,9 @@ from typing import List from torch.optim.lr_scheduler import MultiStepLR as _MultiStepLR -from colossalai.legacy.registry import LR_SCHEDULERS - from .delayed import WarmupScheduler -@LR_SCHEDULERS.register_module class MultiStepLR(_MultiStepLR): """Decays the learning rate of each parameter group by gamma once the number of epoch reaches one of the milestones. Notice that such decay can @@ -33,7 +30,6 @@ class MultiStepLR(_MultiStepLR): super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class MultiStepWarmupLR(WarmupScheduler): """Multistep learning rate scheduler with warmup. diff --git a/colossalai/nn/lr_scheduler/onecycle.py b/colossalai/nn/lr_scheduler/onecycle.py index 20e9aaec6..6835b3ee1 100644 --- a/colossalai/nn/lr_scheduler/onecycle.py +++ b/colossalai/nn/lr_scheduler/onecycle.py @@ -1,9 +1,6 @@ from torch.optim.lr_scheduler import OneCycleLR as _OneCycleLR -from colossalai.legacy.registry import LR_SCHEDULERS - -@LR_SCHEDULERS.register_module class OneCycleLR(_OneCycleLR): r"""Sets the learning rate of each parameter group according to the 1cycle learning rate policy. The 1cycle policy anneals the learning diff --git a/colossalai/nn/lr_scheduler/poly.py b/colossalai/nn/lr_scheduler/poly.py index a98506423..4f2249720 100644 --- a/colossalai/nn/lr_scheduler/poly.py +++ b/colossalai/nn/lr_scheduler/poly.py @@ -1,11 +1,8 @@ from torch.optim.lr_scheduler import _LRScheduler -from colossalai.legacy.registry import LR_SCHEDULERS - from .delayed import WarmupScheduler -@LR_SCHEDULERS.register_module class PolynomialLR(_LRScheduler): """Polynomial learning rate scheduler. @@ -41,7 +38,6 @@ class PolynomialLR(_LRScheduler): for base_lr in self.base_lrs] -@LR_SCHEDULERS.register_module class PolynomialWarmupLR(WarmupScheduler): """Polynomial learning rate scheduler with warmup. 
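With `@LR_SCHEDULERS.register_module` removed, these schedulers are no longer discoverable through the legacy registry and are simply imported and constructed directly. A minimal sketch of that direct usage; the model and optimizer are placeholders, and the `total_steps` argument follows the `super().__init__` call visible in the cosine.py hunk above:

```python
import torch
from colossalai.nn.lr_scheduler import CosineAnnealingLR

# placeholder model/optimizer purely for illustration
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# construct the scheduler directly instead of looking it up in LR_SCHEDULERS
scheduler = CosineAnnealingLR(optimizer, total_steps=100)

for _ in range(100):
    optimizer.step()
    scheduler.step()
```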
diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py index 09f5d4585..8846e13c7 100644 --- a/colossalai/nn/lr_scheduler/torch.py +++ b/colossalai/nn/lr_scheduler/torch.py @@ -3,10 +3,7 @@ from torch.optim.lr_scheduler import LambdaLR as _LambdaLR from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR from torch.optim.lr_scheduler import StepLR as _StepLR -from colossalai.legacy.registry import LR_SCHEDULERS - -@LR_SCHEDULERS.register_module class LambdaLR(_LambdaLR): """Sets the learning rate of each parameter group to the initial lr times a given function. When last_epoch=-1, sets initial lr as lr. @@ -24,7 +21,6 @@ class LambdaLR(_LambdaLR): super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class MultiplicativeLR(_MultiplicativeLR): """Multiply the learning rate of each parameter group by the factor given in the specified function. When last_epoch=-1, sets initial lr as lr. @@ -42,7 +38,6 @@ class MultiplicativeLR(_MultiplicativeLR): super().__init__(optimizer, lr_lambda, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class StepLR(_StepLR): """Decays the learning rate of each parameter group by gamma every step_size epochs. Notice that such decay can happen simultaneously with @@ -61,7 +56,6 @@ class StepLR(_StepLR): super().__init__(optimizer, step_size, gamma=gamma, last_epoch=last_epoch) -@LR_SCHEDULERS.register_module class ExponentialLR(_ExponentialLR): """Decays the learning rate of each parameter group by gamma every epoch. When last_epoch=-1, sets initial lr as lr diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 210400a21..9767fcb8b 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -4,12 +4,10 @@ from typing import Optional import torch from colossalai.kernel.op_builder import CPUAdamBuilder -from colossalai.legacy.registry import OPTIMIZERS from .nvme_optimizer import NVMeOptimizer -@OPTIMIZERS.register_module class CPUAdam(NVMeOptimizer): """Implements Adam algorithm. diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py index 0d13873cd..3a05a34f5 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -8,11 +8,9 @@ Licensed under the MIT License. ''' import torch -from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier -@OPTIMIZERS.register_module class FusedAdam(torch.optim.Optimizer): """Implements Adam algorithm. diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py index 48cc097c7..a2807d70f 100644 --- a/colossalai/nn/optimizer/fused_lamb.py +++ b/colossalai/nn/optimizer/fused_lamb.py @@ -1,11 +1,9 @@ # modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py import torch -from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier -@OPTIMIZERS.register_module class FusedLAMB(torch.optim.Optimizer): """Implements LAMB algorithm. 
diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py index 0e8d3fc10..59a93a8be 100644 --- a/colossalai/nn/optimizer/fused_sgd.py +++ b/colossalai/nn/optimizer/fused_sgd.py @@ -2,11 +2,9 @@ import torch from torch.optim.optimizer import Optimizer, required -from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier -@OPTIMIZERS.register_module class FusedSGD(Optimizer): r"""Implements stochastic gradient descent (optionally with momentum). diff --git a/colossalai/nn/optimizer/hybrid_adam.py b/colossalai/nn/optimizer/hybrid_adam.py index 7aa0ced18..e08df410e 100644 --- a/colossalai/nn/optimizer/hybrid_adam.py +++ b/colossalai/nn/optimizer/hybrid_adam.py @@ -4,13 +4,11 @@ import torch from torch.optim import Adam from colossalai.kernel.op_builder import FusedOptimBuilder -from colossalai.legacy.registry import OPTIMIZERS from colossalai.utils import multi_tensor_applier from .cpu_adam import CPUAdam -@OPTIMIZERS.register_module class HybridAdam(CPUAdam): """Implements Adam algorithm. diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py index 769c11f62..d5de267f7 100644 --- a/colossalai/nn/optimizer/lamb.py +++ b/colossalai/nn/optimizer/lamb.py @@ -5,10 +5,7 @@ Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch- import torch from torch.optim import Optimizer -from colossalai.legacy.registry import OPTIMIZERS - -@OPTIMIZERS.register_module class Lamb(Optimizer): r"""Implements Lamb algorithm. It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. diff --git a/colossalai/nn/optimizer/lars.py b/colossalai/nn/optimizer/lars.py index 9dbb83b84..58393fdae 100644 --- a/colossalai/nn/optimizer/lars.py +++ b/colossalai/nn/optimizer/lars.py @@ -5,10 +5,7 @@ from typing import Iterable import torch from torch.optim import Optimizer -from colossalai.legacy.registry import OPTIMIZERS - -@OPTIMIZERS.register_module class Lars(Optimizer): r"""Implements the LARS optimizer from `"Large batch training of convolutional networks" <https://arxiv.org/pdf/1708.03888.pdf>`_. 
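The same registry cleanup applies to the optimizers: with `@OPTIMIZERS.register_module` gone, they behave as ordinary `torch.optim.Optimizer` subclasses imported from `colossalai.nn.optimizer`. A minimal sketch with a placeholder model; it assumes the CPU/fused Adam kernels can be built on the target machine:

```python
import torch
from colossalai.nn.optimizer import HybridAdam

model = torch.nn.Linear(16, 16)                      # placeholder model
optimizer = HybridAdam(model.parameters(), lr=1e-3)  # direct construction, no registry lookup

loss = model(torch.randn(4, 16)).sum()
loss.backward()
optimizer.step()
```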
diff --git a/colossalai/pipeline/pipelinable.py b/colossalai/pipeline/pipelinable.py index 79913987b..ba8b1591d 100644 --- a/colossalai/pipeline/pipelinable.py +++ b/colossalai/pipeline/pipelinable.py @@ -1,15 +1,24 @@ -import torch import inspect + +import torch + +from colossalai.context import ParallelMode +from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.utils import CheckpointModule +from colossalai.tensor import ColoParameter from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses -from .utils import partition_uniform, partition_balanced, build_kwargs_for_function, \ - build_kwargs_for_module, exec_func_with_kwargs, exec_funcs_with_kwargs, \ - call_module, customized_partition -from colossalai.nn.layer.utils import CheckpointModule -from colossalai.tensor import ColoParameter -from colossalai.core import global_context as gpc -from colossalai.context import ParallelMode from .layer_spec import LayerSpec +from .utils import ( + build_kwargs_for_function, + build_kwargs_for_module, + call_module, + customized_partition, + exec_func_with_kwargs, + exec_funcs_with_kwargs, + partition_balanced, + partition_uniform, +) class PipelinableContext(InsertPostInitMethodToModuleSubClasses): diff --git a/colossalai/pipeline/utils.py b/colossalai/pipeline/utils.py index ac8a3ad7d..be8428692 100644 --- a/colossalai/pipeline/utils.py +++ b/colossalai/pipeline/utils.py @@ -1,12 +1,13 @@ import heapq import inspect -import torch - -from colossalai.logging import get_dist_logger -from colossalai.nn.layer.utils import CheckpointModule +from collections import OrderedDict from typing import List -from collections import OrderedDict +import torch + +from colossalai.legacy.nn.layer.utils import CheckpointModule +from colossalai.logging import get_dist_logger + def _binary_partition(weights: List, start: int, end: int): """Returns the binary partition position of `weights`, given the start @@ -162,7 +163,7 @@ def build_kwargs_for_module(function, input_tensor, kw_dict): kwargs_offset = 1 elif isinstance(input_tensor, (tuple, OrderedDict)): #assert isinstance(input_tensor, tuple), f'input_tensor should be a torch.Tensor or a tuple object.' - # Huggingface will take their own structures based on OrderedDict as the output + # Huggingface will take their own structures based on OrderedDict as the output # between layers so we've to close this check. kwargs_offset = len(input_tensor) args_name_list = list(sig.parameters.keys()) @@ -256,7 +257,7 @@ def call_module(module, args=None, kwargs=None): def customized_partition(exec_seq): ''' - This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an + This function will analyze the exec_seq. In the exec_seq, users will use 'SPLIT_NODE' as an annotation to note the partition point. 
''' customized_parts = {} diff --git a/colossalai/tensor/dist_spec_mgr.py b/colossalai/tensor/dist_spec_mgr.py index c968050de..4740a316b 100644 --- a/colossalai/tensor/dist_spec_mgr.py +++ b/colossalai/tensor/dist_spec_mgr.py @@ -2,7 +2,6 @@ from contextlib import contextmanager import torch import torch.distributed as dist -# from colossalai.nn.layer.utils import divide from numpy import prod from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 7b2e8480c..6f9717d35 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,12 +1,14 @@ from .activation_checkpoint import checkpoint from .checkpointing import load_checkpoint, save_checkpoint from .common import ( + _cast_float, clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32, disposable, ensure_path_exists, + free_storage, is_ddp_ignored, is_dp_rank_0, is_model_parallel_parameter, @@ -72,4 +74,6 @@ __all__ = [ 'disposable', 'colo_set_cpu_memory_capacity', 'colo_get_cpu_memory_capacity', + '_cast_float', + 'free_storage', ] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 8022e84dc..998901708 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -470,3 +470,22 @@ def disposable(func: Callable) -> Callable: return func(*args, **kwargs) return wrapper + + +def free_storage(data: torch.Tensor) -> None: + """Free underlying storage of a Tensor.""" + if data.storage().size() > 0: + # Since we're modifying the Tensor's Storage directly, make sure the Tensor + # is the sole occupant of the Storage. + assert data.storage_offset() == 0 + data.storage().resize_(0) + + +def _cast_float(args, dtype: torch.dtype): + if isinstance(args, torch.Tensor) and torch.is_floating_point(args): + args = args.to(dtype) + elif isinstance(args, (list, tuple)): + args = type(args)(_cast_float(t, dtype) for t in args) + elif isinstance(args, dict): + args = {k: _cast_float(v, dtype) for k, v in args.items()} + return args diff --git a/colossalai/utils/data_sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py index 4ca7bce7b..881ddde78 100644 --- a/colossalai/utils/data_sampler/data_parallel_sampler.py +++ b/colossalai/utils/data_sampler/data_parallel_sampler.py @@ -12,12 +12,10 @@ from torch.utils.data import DataLoader, Dataset, Sampler from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.legacy.registry import DATA_SAMPLERS T_co = TypeVar('T_co', covariant=True) -@DATA_SAMPLERS.register_module class DataParallelSampler(Sampler): """A data sampler for distributed data parallelism. 
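Since `_cast_float` and `free_storage` now live in `colossalai.utils` (their full implementations are in the common.py hunk above), callers that previously reached into `colossalai.nn.parallel.data_parallel` for them only need to switch the import. A small usage sketch based on the code shown:

```python
import torch
from colossalai.utils import _cast_float, free_storage

batch = {
    "input": torch.randn(2, 4),                  # floating point -> cast
    "mask": torch.ones(2, 4, dtype=torch.bool),  # non-float -> left untouched
    "extras": [torch.randn(3), 42],              # lists/tuples are handled recursively
}
half_batch = _cast_float(batch, torch.float16)
assert half_batch["input"].dtype == torch.float16
assert half_batch["mask"].dtype == torch.bool

# free_storage drops the underlying storage of a tensor that owns it outright
t = torch.randn(8)
free_storage(t)
assert t.storage().size() == 0
```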
diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/zero/gemini/colo_init_context.py index 75f8576ca..dad852a34 100644 --- a/colossalai/zero/gemini/colo_init_context.py +++ b/colossalai/zero/gemini/colo_init_context.py @@ -87,7 +87,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses): self._default_dist_spec = default_dist_spec def _register_colo_modules(self): - from colossalai.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module + from colossalai.legacy.nn.parallel.layers import ColoEmbedding, ColoLinear, register_colo_module register_colo_module(torch.nn.Linear, ColoLinear()) register_colo_module(torch.nn.Embedding, ColoEmbedding()) diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py index 741a977d1..918b08cd3 100644 --- a/colossalai/zero/gemini/gemini_ddp.py +++ b/colossalai/zero/gemini/gemini_ddp.py @@ -10,15 +10,13 @@ import torch.nn as nn from torch.distributed import ProcessGroup from torch.distributed.distributed_c10d import _get_default_group -from colossalai.checkpoint_io.utils import calculate_tensor_size, StateDictSharder +from colossalai.checkpoint_io.utils import StateDictSharder, calculate_tensor_size from colossalai.interface import ModelWrapper - from colossalai.lazy import LazyTensor from colossalai.logging import get_dist_logger -from colossalai.nn.parallel.data_parallel import _cast_float, free_storage from colossalai.tensor.colo_parameter import ColoParameter from colossalai.tensor.param_op_hook import ColoParamOpHookManager -from colossalai.utils import get_current_device, is_ddp_ignored +from colossalai.utils import _cast_float, free_storage, get_current_device, is_ddp_ignored from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager from .gemini_hook import GeminiZeROHook @@ -780,5 +778,3 @@ class GeminiDDP(ModelWrapper): yield block, block_size yield sharder.current_block, sharder.current_block_size - - diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py index 0c9eac8b6..e5466965c 100644 --- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py +++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py @@ -1,7 +1,7 @@ import torch.nn -from colossalai.nn.parallel.data_parallel import _cast_float from colossalai.tensor.param_op_hook import ColoParamOpHookManager +from colossalai.utils import _cast_float from colossalai.zero.legacy.gemini.ophooks.runtime_mem_tracer_hook import ( GradMemStats, GradMemTracerHook, diff --git a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md index 281fd4755..0a94a7f5d 100644 --- a/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/en/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -176,7 +176,7 @@ In our latest example, a Gemini + ZeRO DDP model is also defined to reduce overh ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 5aa806c64..36c94fb49 100644 --- 
a/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 22022639c..0ec9d5c3c 100644 --- a/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/en/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -78,7 +78,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/en/basics/engine_trainer.md b/docs/source/en/basics/engine_trainer.md index 6d2355ad9..e17c37e24 100644 --- a/docs/source/en/basics/engine_trainer.md +++ b/docs/source/en/basics/engine_trainer.md @@ -344,7 +344,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): If you wish to train with a trainer object, you can follow the code snippet below: ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md index 3f85d5045..dfd1e2910 100644 --- a/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md +++ b/docs/source/zh-Hans/advanced_tutorials/parallelize_your_training_like_Megatron.md @@ -160,7 +160,7 @@ for mn, module in model.named_modules(): ```python def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"): - from colossalai.nn.parallel import GeminiDDP + from colossalai.zero import GeminiDDP model = GeminiDDP(model, device=get_current_device(), placement_policy=placement_policy, diff --git a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md index 9cfbf5873..3f57f39f2 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_gpt_using_hybrid_parallelism.md @@ -42,7 +42,7 @@ from colossalai.core import global_context as gpc from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule) from colossalai.logging import disable_existing_loggers, get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.legacy.trainer import Trainer, hooks from colossalai.utils.timer import MultiTimer from model_zoo.gpt import GPTLMLoss diff --git 
a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 803882a5a..f7dd8d477 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -73,7 +73,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.logging import disable_existing_loggers, get_dist_logger from colossalai.nn.lr_scheduler import LinearWarmupLR -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks ``` diff --git a/docs/source/zh-Hans/basics/engine_trainer.md b/docs/source/zh-Hans/basics/engine_trainer.md index e57220292..ed5100299 100644 --- a/docs/source/zh-Hans/basics/engine_trainer.md +++ b/docs/source/zh-Hans/basics/engine_trainer.md @@ -340,7 +340,7 @@ for epoch in range(gpc.config.NUM_EPOCHS): ```python -from colossalai.nn.metric import Accuracy +from colossalai.legacy.nn.metric import Accuracy from colossalai.legacy.trainer import Trainer, hooks diff --git a/examples/language/gpt/titans/model/embed.py b/examples/language/gpt/titans/model/embed.py index 668992901..e521193a9 100644 --- a/examples/language/gpt/titans/model/embed.py +++ b/examples/language/gpt/titans/model/embed.py @@ -8,11 +8,11 @@ from torch.nn.parameter import Parameter from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input +from colossalai.legacy.nn.layer.parallel_1d.layers import Linear1D_Row +from colossalai.legacy.nn.layer.utils import divide from colossalai.legacy.registry import LAYERS, LOSSES, MODELS -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.parallel_1d._utils import gather_forward_split_backward, reduce_grad, reduce_input -from colossalai.nn.layer.parallel_1d.layers import Linear1D_Row -from colossalai.nn.layer.utils import divide from colossalai.utils import get_current_device diff --git a/examples/language/gpt/titans/model/gpt1d.py b/examples/language/gpt/titans/model/gpt1d.py index 2edd03606..72297c540 100644 --- a/examples/language/gpt/titans/model/gpt1d.py +++ b/examples/language/gpt/titans/model/gpt1d.py @@ -11,9 +11,9 @@ from colossalai import kernel from colossalai import nn as col_nn from colossalai.core import global_context as gpc from colossalai.kernel.cuda_native.scaled_softmax import AttnMaskType -from colossalai.nn.layer import Linear1D_Col, Linear1D_Row -from colossalai.nn.layer.base_layer import ParallelLayer -from colossalai.nn.layer.utils import ACT2FN, divide +from colossalai.legacy.nn.layer import Linear1D_Col, Linear1D_Row +from colossalai.legacy.nn.layer.base_layer import ParallelLayer +from colossalai.legacy.nn.layer.utils import ACT2FN, divide from colossalai.utils import checkpoint from colossalai.utils.activation_checkpoint import checkpoint diff --git a/examples/language/gpt/titans/model/pipeline_gpt1d.py b/examples/language/gpt/titans/model/pipeline_gpt1d.py index 30180285b..9b22d156b 100644 --- a/examples/language/gpt/titans/model/pipeline_gpt1d.py +++ b/examples/language/gpt/titans/model/pipeline_gpt1d.py @@ -9,8 +9,8 @@ from colossalai import kernel from colossalai import nn as 
col_nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.pipeline.utils import partition_uniform from .embed import HiddenParallelEmbedding, HiddenParallelGPTLMHead1D, VocabParallelEmbedding, VocabParallelGPTLMHead1D diff --git a/examples/tutorial/hybrid_parallel/test_ci.sh b/examples/tutorial/hybrid_parallel/test_ci.sh index e0dbef354..24cee1da3 100644 --- a/examples/tutorial/hybrid_parallel/test_ci.sh +++ b/examples/tutorial/hybrid_parallel/test_ci.sh @@ -1,5 +1,7 @@ #!/bin/bash set -euxo pipefail -pip install -r requirements.txt -colossalai run --nproc_per_node 4 train.py --config config.py +echo "legacy example" + +# pip install -r requirements.txt +# colossalai run --nproc_per_node 4 train.py --config config.py diff --git a/examples/tutorial/hybrid_parallel/train.py b/examples/tutorial/hybrid_parallel/train.py index 4953d5350..12cdec902 100644 --- a/examples/tutorial/hybrid_parallel/train.py +++ b/examples/tutorial/hybrid_parallel/train.py @@ -7,8 +7,8 @@ from tqdm import tqdm import colossalai from colossalai.context import ParallelMode from colossalai.core import global_context as gpc +from colossalai.legacy.nn import CrossEntropyLoss from colossalai.logging import get_dist_logger -from colossalai.nn import CrossEntropyLoss from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.pipeline.pipelinable import PipelinableContext from colossalai.utils import is_using_pp diff --git a/examples/tutorial/sequence_parallel/model/bert.py b/examples/tutorial/sequence_parallel/model/bert.py index 049579c5a..b8adb501f 100644 --- a/examples/tutorial/sequence_parallel/model/bert.py +++ b/examples/tutorial/sequence_parallel/model/bert.py @@ -1,33 +1,37 @@ -from colossalai.context.parallel_mode import ParallelMode +import inspect + import torch import torch.nn as nn -import inspect -from .layers import Embedding, BertLayer, BertDualHead, PreProcessor, VocabEmbedding -from .layers.init_method import init_normal, output_init_normal -from colossalai.core import global_context as gpc + from colossalai.context import ParallelMode +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc from colossalai.kernel import LayerNorm -from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper +from colossalai.legacy.nn.layer.wrapper import PipelineSharedModuleWrapper from colossalai.logging import get_dist_logger from colossalai.pipeline.utils import partition_uniform +from .layers import BertDualHead, BertLayer, Embedding, PreProcessor, VocabEmbedding +from .layers.init_method import init_normal, output_init_normal + class BertForPretrain(nn.Module): - def __init__(self, - vocab_size, - hidden_size, - max_sequence_length, - num_attention_heads, - num_layers, - add_binary_head, - is_naive_fp16, - num_tokentypes=2, - dropout_prob=0.1, - mlp_ratio=4, - init_std=0.02, - convert_fp16_to_fp32_in_softmax=False, - ): + def __init__( + self, + vocab_size, + hidden_size, + max_sequence_length, + num_attention_heads, + num_layers, + add_binary_head, + is_naive_fp16, + num_tokentypes=2, + dropout_prob=0.1, + mlp_ratio=4, + init_std=0.02, + convert_fp16_to_fp32_in_softmax=False, + ): super().__init__() self.seq_parallel_size = gpc.get_world_size(ParallelMode.SEQUENCE) assert 
max_sequence_length % self.seq_parallel_size == 0, 'sequence length is not divisible by the sequence parallel size' @@ -47,19 +51,19 @@ class BertForPretrain(nn.Module): self.bert_layers = nn.ModuleList() for i in range(num_layers): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, self.embedding.word_embedding_weight.size(0), + self.head = BertDualHead(hidden_size, + self.embedding.word_embedding_weight.size(0), add_binary_head=add_binary_head) self.reset_parameters() @@ -166,22 +170,20 @@ class PipelineBertForPretrain(nn.Module): end_idx = num_layers for i in range(start_idx, end_idx): - bert_layer = BertLayer(layer_number=i+1, + bert_layer = BertLayer(layer_number=i + 1, hidden_size=hidden_size, num_attention_heads=num_attention_heads, attention_dropout=dropout_prob, mlp_ratio=mlp_ratio, hidden_dropout=dropout_prob, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - is_naive_fp16=is_naive_fp16 - ) + is_naive_fp16=is_naive_fp16) self.bert_layers.append(bert_layer) if self.last_stage: self.word_embeddings = VocabEmbedding(vocab_size, hidden_size) self.layer_norm = LayerNorm(hidden_size) - self.head = BertDualHead(hidden_size, vocab_size, - add_binary_head=add_binary_head) + self.head = BertDualHead(hidden_size, vocab_size, add_binary_head=add_binary_head) self.reset_parameters() def _init_normal(self, tensor): diff --git a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py index 4ede21516..56ba511d8 100644 --- a/examples/tutorial/sequence_parallel/model/layers/bert_layer.py +++ b/examples/tutorial/sequence_parallel/model/layers/bert_layer.py @@ -1,10 +1,12 @@ import torch import torch.nn as nn -from colossalai.nn.layer.parallel_sequence import TransformerSelfAttentionRing -from colossalai.kernel.jit import bias_dropout_add_fused_train, bias_dropout_add_fused_inference + from colossalai.kernel.cuda_native import LayerNorm -from .mlp import TransformerMLP +from colossalai.kernel.jit import bias_dropout_add_fused_inference, bias_dropout_add_fused_train +from colossalai.legacy.nn.layer.parallel_sequence import TransformerSelfAttentionRing + from .dropout import get_bias_dropout_add +from .mlp import TransformerMLP def attention_mask_func(attention_scores, attention_mask): @@ -48,8 +50,7 @@ class BertLayer(nn.Module): layer_number=layer_number, apply_query_key_layer_scaling=True, convert_fp16_to_fp32_in_softmax=convert_fp16_to_fp32_in_softmax, - fp16=is_naive_fp16 - ) + fp16=is_naive_fp16) self.hidden_dropout = hidden_dropout self.bias_dropout_fusion = bias_dropout_fusion @@ -89,11 +90,8 @@ class BertLayer(nn.Module): # re-enable torch grad to enable fused optimization. with torch.enable_grad(): - layernorm_input = bias_dropout_add_func( - attention_output, - attention_bias.expand_as(residual), - residual, - self.hidden_dropout) + layernorm_input = bias_dropout_add_func(attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) # Layer norm post the self attention. 
         layernorm_output = self.post_attention_layernorm(layernorm_input)
@@ -109,10 +107,6 @@ class BertLayer(nn.Module):
         # re-enable torch grad to enable fused optimization.
         with torch.enable_grad():
-            output = bias_dropout_add_func(
-                mlp_output,
-                mlp_bias.expand_as(residual),
-                residual,
-                self.hidden_dropout)
+            output = bias_dropout_add_func(mlp_output, mlp_bias.expand_as(residual), residual, self.hidden_dropout)
         return output
diff --git a/tests/components_to_test/hanging_param_model.py b/tests/components_to_test/hanging_param_model.py
index 329a08ea2..0e6543121 100644
--- a/tests/components_to_test/hanging_param_model.py
+++ b/tests/components_to_test/hanging_param_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/inline_op_model.py b/tests/components_to_test/inline_op_model.py
index f061d48f9..80757f361 100644
--- a/tests/components_to_test/inline_op_model.py
+++ b/tests/components_to_test/inline_op_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/nested_model.py b/tests/components_to_test/nested_model.py
index 339084639..3e779b0a6 100644
--- a/tests/components_to_test/nested_model.py
+++ b/tests/components_to_test/nested_model.py
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from .registry import non_distributed_component_funcs
 from .utils import DummyDataGenerator
diff --git a/tests/components_to_test/repeated_computed_layers.py b/tests/components_to_test/repeated_computed_layers.py
index b3f84bd0e..c1ef99aa0 100644
--- a/tests/components_to_test/repeated_computed_layers.py
+++ b/tests/components_to_test/repeated_computed_layers.py
@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from .registry import non_distributed_component_funcs
 from .utils.dummy_data_generator import DummyDataGenerator
diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py
index cd9d7ebc0..064974a15 100644
--- a/tests/components_to_test/simple_net.py
+++ b/tests/components_to_test/simple_net.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
-from colossalai.nn import CheckpointModule
+from colossalai.legacy.nn import CheckpointModule
 from colossalai.utils.cuda import get_current_device
 from .registry import non_distributed_component_funcs
diff --git a/tests/test_comm/test_boardcast_send_recv_v2.py b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
similarity index 93%
rename from tests/test_comm/test_boardcast_send_recv_v2.py
rename to tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
index 253f6f21c..c5fb049fe 100644
--- a/tests/test_comm/test_boardcast_send_recv_v2.py
+++ b/tests/test_legacy/test_comm/test_boardcast_send_recv_v2.py
@@ -1,10 +1,10 @@
 import pytest
 import torch
-from colossalai.communication.p2p_v2 import _recv_object, _send_object
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import _recv_object, _send_object
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import rerun_if_address_is_in_use, spawn
diff --git a/tests/test_comm/test_comm.py b/tests/test_legacy/test_comm/test_comm.py
similarity index 96%
rename from tests/test_comm/test_comm.py
rename to tests/test_legacy/test_comm/test_comm.py
index 747596bd2..3251d8d46 100644
--- a/tests/test_comm/test_comm.py
+++ b/tests/test_legacy/test_comm/test_comm.py
@@ -2,10 +2,10 @@ import pytest
 import torch
 import torch.distributed as dist
-from colossalai.communication import all_gather, all_reduce, reduce_scatter
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication import all_gather, all_reduce, reduce_scatter
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from colossalai.utils import get_current_device
diff --git a/tests/test_comm/test_object_list_p2p.py b/tests/test_legacy/test_comm/test_object_list_p2p.py
similarity index 98%
rename from tests/test_comm/test_object_list_p2p.py
rename to tests/test_legacy/test_comm/test_object_list_p2p.py
index e9d7630c1..f50982ee1 100644
--- a/tests/test_comm/test_object_list_p2p.py
+++ b/tests/test_legacy/test_comm/test_object_list_p2p.py
@@ -1,7 +1,10 @@
 import pytest
 import torch
-from colossalai.communication.p2p import (
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p import (
     recv_backward,
     recv_forward,
     send_backward,
@@ -9,9 +12,6 @@ from colossalai.communication.p2p import (
     send_forward,
     send_forward_recv_backward,
 )
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 CONFIG = dict(parallel=dict(pipeline=2))
diff --git a/tests/test_comm/test_object_list_p2p_v2.py b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py
similarity index 97%
rename from tests/test_comm/test_object_list_p2p_v2.py
rename to tests/test_legacy/test_comm/test_object_list_p2p_v2.py
index cae38385b..040c63322 100644
--- a/tests/test_comm/test_object_list_p2p_v2.py
+++ b/tests/test_legacy/test_comm/test_object_list_p2p_v2.py
@@ -1,10 +1,10 @@
 import pytest
 import torch
-from colossalai.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
+from colossalai.legacy.communication.p2p_v2 import recv_backward, recv_forward, send_backward, send_forward
 from colossalai.logging import disable_existing_loggers
 from colossalai.testing import rerun_if_address_is_in_use, spawn
diff --git a/tests/test_engine/test_engine.py b/tests/test_legacy/test_engine/test_engine.py
similarity index 100%
rename from tests/test_engine/test_engine.py
rename to tests/test_legacy/test_engine/test_engine.py
diff --git a/tests/test_engine/test_gradient_accumluation.py b/tests/test_legacy/test_engine/test_gradient_accumluation.py
similarity index 100%
rename from tests/test_engine/test_gradient_accumluation.py
rename to tests/test_legacy/test_engine/test_gradient_accumluation.py
diff --git a/tests/test_layers/test_1d/checks_1d/__init__.py b/tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py
similarity index 100%
rename from tests/test_layers/test_1d/checks_1d/__init__.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/__init__.py
diff --git a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
similarity index 99%
rename from tests/test_layers/test_1d/checks_1d/check_layer_1d.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
index 668b8a334..dcb2be626 100644
--- a/tests/test_layers/test_1d/checks_1d/check_layer_1d.py
+++ b/tests/test_legacy/test_layers/test_1d/checks_1d/check_layer_1d.py
@@ -5,7 +5,7 @@ from torch.nn import Parameter
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.global_variables import tensor_parallel_env as env
-from colossalai.nn import (
+from colossalai.legacy.nn import (
     Classifier1D,
     Embedding1D,
     Linear1D_Col,
diff --git a/tests/test_layers/test_1d/checks_1d/common.py b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py
similarity index 94%
rename from tests/test_layers/test_1d/checks_1d/common.py
rename to tests/test_legacy/test_layers/test_1d/checks_1d/common.py
index 8b7b28613..29a9a3d20 100644
--- a/tests/test_layers/test_1d/checks_1d/common.py
+++ b/tests/test_legacy/test_layers/test_1d/checks_1d/common.py
@@ -1,15 +1,16 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import torch
-
-DEPTH = 4
-BATCH_SIZE = 8
-SEQ_LENGTH = 8
-IMG_SIZE = 16
-HIDDEN_SIZE = 8
-NUM_CLASSES = 8
-VOCAB_SIZE = 16
-
-def check_equal(A, B):
-    assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+DEPTH = 4
+BATCH_SIZE = 8
+SEQ_LENGTH = 8
+IMG_SIZE = 16
+HIDDEN_SIZE = 8
+NUM_CLASSES = 8
+VOCAB_SIZE = 16
+
+
+def check_equal(A, B):
+    assert torch.allclose(A, B, rtol=1e-3, atol=1e-1) == True
diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_legacy/test_layers/test_1d/test_1d.py
similarity index 100%
rename from tests/test_layers/test_1d/test_1d.py
rename to tests/test_legacy/test_layers/test_1d/test_1d.py
diff --git a/tests/test_layers/test_2d/checks_2d/__init__.py b/tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py
similarity index 100%
rename from tests/test_layers/test_2d/checks_2d/__init__.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/__init__.py
diff --git a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
similarity index 97%
rename from tests/test_layers/test_2d/checks_2d/check_layer_2d.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
index e030e473a..0ee88c260 100644
--- a/tests/test_layers/test_2d/checks_2d/check_layer_2d.py
+++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_layer_2d.py
@@ -1,12 +1,23 @@
 import torch
+
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2D, CrossEntropyLoss2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D,
-                           VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2D,
-                           VocabParallelCrossEntropyLoss2D, VocabParallelEmbedding2D)
+from colossalai.legacy.nn import (
+    Classifier2D,
+    CrossEntropyLoss2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2D,
+    VocabParallelCrossEntropyLoss2D,
+    VocabParallelEmbedding2D,
+)
 from colossalai.utils import get_current_device, print_rank_0
-from .common import (BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal)
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal
 def check_linear():
@@ -336,7 +347,7 @@ def check_classifier_no_given_weight():
     layer.weight.data.copy_(W)
     # W.requires_grad = True
-    B_shape = (OUTPUT_SIZE, )
+    B_shape = (OUTPUT_SIZE,)
     B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
     torch.distributed.broadcast(B_master, src=0)
     # B = torch.chunk(B_master, DEPTH, dim=0)[j]
@@ -572,7 +583,7 @@ def check_loss():
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, DEPTH, dim=0)[i]
@@ -607,7 +618,7 @@ def check_vocab_parallel_loss():
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, DEPTH, dim=0)[i]
diff --git a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
similarity index 96%
rename from tests/test_layers/test_2d/checks_2d/check_operation_2d.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
index a5e37b1ec..ae1d1120c 100644
--- a/tests/test_layers/test_2d/checks_2d/check_operation_2d.py
+++ b/tests/test_legacy/test_layers/test_2d/checks_2d/check_operation_2d.py
@@ -5,10 +5,10 @@ import torch
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
-from .common import check_equal, BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE, DEPTH
+from colossalai.legacy.nn.layer.parallel_2d._operation import Matmul_AB_2D, Matmul_ABT_2D, Matmul_ATB_2D
+from colossalai.utils import get_current_device, print_rank_0
+
+from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, SEQ_LENGTH, check_equal
 def check_AB():
diff --git a/tests/test_layers/test_2d/checks_2d/common.py b/tests/test_legacy/test_layers/test_2d/checks_2d/common.py
similarity index 100%
rename from tests/test_layers/test_2d/checks_2d/common.py
rename to tests/test_legacy/test_layers/test_2d/checks_2d/common.py
diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_legacy/test_layers/test_2d/test_2d.py
similarity index 100%
rename from tests/test_layers/test_2d/test_2d.py
rename to tests/test_legacy/test_layers/test_2d/test_2d.py
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/__init__.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py
similarity index 100%
rename from tests/test_layers/test_2p5d/checks_2p5d/__init__.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/__init__.py
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
similarity index 98%
rename from tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
index a8f551093..5a99b05cf 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_layer_2p5d.py
@@ -1,11 +1,22 @@
 import torch
+from torch.nn import Parameter
+
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import (Classifier2p5D, CrossEntropyLoss2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D,
-                           PatchEmbedding2p5D, VanillaClassifier, VanillaPatchEmbedding, VocabParallelClassifier2p5D,
-                           VocabParallelCrossEntropyLoss2p5D, VocabParallelEmbedding2p5D)
+from colossalai.legacy.nn import (
+    Classifier2p5D,
+    CrossEntropyLoss2p5D,
+    Embedding2p5D,
+    LayerNorm2p5D,
+    Linear2p5D,
+    PatchEmbedding2p5D,
+    VanillaClassifier,
+    VanillaPatchEmbedding,
+    VocabParallelClassifier2p5D,
+    VocabParallelCrossEntropyLoss2p5D,
+    VocabParallelEmbedding2p5D,
+)
 from colossalai.utils import get_current_device, print_rank_0
-from torch.nn import Parameter
 from .common import *
@@ -342,7 +353,7 @@ def check_classifier_no_given_weight():
     layer.weight.data.copy_(W)
     # W.requires_grad = True
-    B_shape = (OUTPUT_SIZE, )
+    B_shape = (OUTPUT_SIZE,)
     B_master = torch.randint(5, B_shape, dtype=dtype, device=device)
     torch.distributed.broadcast(B_master, src=0)
     # B = torch.chunk(B_master, TESSERACT_DIM, dim=0)[j]
@@ -577,7 +588,7 @@ def check_loss():
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]
@@ -612,7 +623,7 @@ def check_vocab_parallel_loss():
     out_shape = (BATCH_SIZE, NUM_CLASSES)
     out_master = torch.randn(out_shape, dtype=dtype, device=device)
-    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE, ), dtype=torch.long, device=device)
+    target_master = torch.randint(NUM_CLASSES, (BATCH_SIZE,), dtype=torch.long, device=device)
     torch.distributed.broadcast(out_master, src=0)
     torch.distributed.broadcast(target_master, src=0)
     out = torch.chunk(out_master, TESSERACT_DIM, dim=0)[i]
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
similarity index 97%
rename from tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
index d0c3b02fc..db1996767 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/check_operation_2p5d.py
@@ -2,10 +2,9 @@ import torch
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, \
-    Matmul_ATB_2p5D
-from colossalai.utils import get_current_device
-from colossalai.utils import print_rank_0
+from colossalai.legacy.nn.layer.parallel_2p5d._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D
+from colossalai.utils import get_current_device, print_rank_0
+
 from .common import *
diff --git a/tests/test_layers/test_2p5d/checks_2p5d/common.py b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
similarity index 75%
rename from tests/test_layers/test_2p5d/checks_2p5d/common.py
rename to tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
index aff85f109..c90d8fc08 100644
--- a/tests/test_layers/test_2p5d/checks_2p5d/common.py
+++ b/tests/test_legacy/test_layers/test_2p5d/checks_2p5d/common.py
@@ -11,4 +11,4 @@ IMG_SIZE = 16
 def check_equal(A, B):
-    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)
\ No newline at end of file
+    assert torch.allclose(A, B, rtol=1e-5, atol=1e-2)
diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_legacy/test_layers/test_2p5d/test_2p5d.py
similarity index 100%
rename from tests/test_layers/test_2p5d/test_2p5d.py
rename to tests/test_legacy/test_layers/test_2p5d/test_2p5d.py
diff --git a/tests/test_layers/test_3d/checks_3d/__init__.py b/tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py
similarity index 100%
rename from tests/test_layers/test_3d/checks_3d/__init__.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/__init__.py
diff --git a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
similarity index 99%
rename from tests/test_layers/test_3d/checks_3d/check_layer_3d.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
index e946a1f59..cee639a9f 100644
--- a/tests/test_layers/test_3d/checks_3d/check_layer_3d.py
+++ b/tests/test_legacy/test_layers/test_3d/checks_3d/check_layer_3d.py
@@ -7,8 +7,7 @@ import torch
 from colossalai.constants import INPUT_GROUP_3D, OUTPUT_GROUP_3D, WEIGHT_GROUP_3D
 from colossalai.core import global_context
-from colossalai.logging import get_dist_logger
-from colossalai.nn import (
+from colossalai.legacy.nn import (
     Classifier3D,
     CrossEntropyLoss3D,
     Embedding3D,
@@ -21,7 +20,8 @@ from colossalai.nn import (
     VocabParallelCrossEntropyLoss3D,
     VocabParallelEmbedding3D,
 )
-from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.legacy.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
+from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, print_rank_0
 from .common import BATCH_SIZE, DEPTH, HIDDEN_SIZE, IMG_SIZE, NUM_CLASSES, SEQ_LENGTH, VOCAB_SIZE, check_equal
diff --git a/tests/test_layers/test_3d/checks_3d/common.py b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py
similarity index 95%
rename from tests/test_layers/test_3d/checks_3d/common.py
rename to tests/test_legacy/test_layers/test_3d/checks_3d/common.py
index afb19c474..509fc2cec 100644
--- a/tests/test_layers/test_3d/checks_3d/common.py
+++ b/tests/test_legacy/test_layers/test_3d/checks_3d/common.py
@@ -16,4 +16,4 @@ VOCAB_SIZE = 16
 def check_equal(A, B):
     eq = torch.allclose(A, B, rtol=1e-3, atol=1e-2)
     assert eq, f"\nA = {A}\nB = {B}"
-    return eq
\ No newline at end of file
+    return eq
diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_legacy/test_layers/test_3d/test_3d.py
similarity index 100%
rename from tests/test_layers/test_3d/test_3d.py
rename to tests/test_legacy/test_layers/test_3d/test_3d.py
diff --git a/tests/test_layers/test_cache_embedding.py b/tests/test_legacy/test_layers/test_cache_embedding.py
similarity index 99%
rename from tests/test_layers/test_cache_embedding.py
rename to tests/test_legacy/test_layers/test_cache_embedding.py
index 22d4f02a4..0760a3f1e 100644
--- a/tests/test_layers/test_cache_embedding.py
+++ b/tests/test_legacy/test_layers/test_cache_embedding.py
@@ -6,7 +6,7 @@ import pytest
 import torch
 import colossalai
-from colossalai.nn.parallel.layers import (
+from colossalai.legacy.nn.parallel.layers import (
     CachedEmbeddingBag,
     CachedParamMgr,
     EvictionStrategy,
diff --git a/tests/test_layers/test_sequence/checks_seq/__init__.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py
similarity index 100%
rename from tests/test_layers/test_sequence/checks_seq/__init__.py
rename to tests/test_legacy/test_layers/test_sequence/checks_seq/__init__.py
diff --git a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
similarity index 91%
rename from tests/test_layers/test_sequence/checks_seq/check_layer_seq.py
rename to tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
index 2b7b999d4..7ff91a7b7 100644
--- a/tests/test_layers/test_sequence/checks_seq/check_layer_seq.py
+++ b/tests/test_legacy/test_layers/test_sequence/checks_seq/check_layer_seq.py
@@ -2,7 +2,7 @@ import torch
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.nn import TransformerSelfAttentionRing
+from colossalai.legacy.nn import TransformerSelfAttentionRing
 from colossalai.utils import get_current_device
diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_legacy/test_layers/test_sequence/test_sequence.py
similarity index 97%
rename from tests/test_layers/test_sequence/test_sequence.py
rename to tests/test_legacy/test_layers/test_sequence/test_sequence.py
index 60f2d55f4..b9e6c1247 100644
--- a/tests/test_layers/test_sequence/test_sequence.py
+++ b/tests/test_legacy/test_layers/test_sequence/test_sequence.py
@@ -5,6 +5,7 @@ import torch.distributed as dist
 import colossalai
 from colossalai.context import ParallelMode
 from colossalai.core import global_context as gpc
+from colossalai.legacy.nn.layer.parallel_sequence import RingAV, RingQK
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 CONFIG = dict(parallel=dict(tensor=dict(size=4, mode='sequence')))
@@ -42,7 +43,7 @@ def check_ring_qk(rank, world_size):
     a = torch.matmul(q, k.transpose(2, 1))
     # compute distributed attention scores
-    ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply
+    ring_qk = RingQK.apply
     sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length)
     # check master and distributed attention scores
@@ -95,7 +96,7 @@ def check_ring_av(rank, world_size):
     out = torch.matmul(a, v)
     # compute distributed attention scores
-    ring_av = colossalai.nn.layer.parallel_sequence.RingAV.apply
+    ring_av = RingAV.apply
     sub_out = ring_av(sub_a, sub_v, batch_size, num_heads, attention_head_size, sub_seq_length)
     # print(f'master output shape: {out.shape}, partial output shape: {sub_out.shape}')
diff --git a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
index 8ad366133..5fb678525 100644
--- a/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
+++ b/tests/test_legacy/test_trainer/test_pipeline/test_p2p.py
@@ -5,7 +5,10 @@ import pytest
 import torch
 import torch.distributed as dist
-from colossalai.communication import (
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.initialize import launch
+from colossalai.legacy.communication import (
     recv_backward,
     recv_forward,
     recv_obj_meta,
@@ -15,9 +18,6 @@ from colossalai.communication import (
     send_forward_recv_backward,
     send_obj_meta,
 )
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.initialize import launch
 from colossalai.logging import get_dist_logger
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from colossalai.utils import get_current_device
diff --git a/tests/test_pipeline/test_cuda_rpc_performance.py b/tests/test_pipeline/test_cuda_rpc_performance.py
deleted file mode 100644
index 4bacb2181..000000000
--- a/tests/test_pipeline/test_cuda_rpc_performance.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import time
-
-import pytest
-import torch
-import torch.nn as nn
-from rpc_test_utils import parse_args, rpc_run
-from titans.dataloader.cifar10 import build_cifar
-from torchvision.models import resnet50
-from tqdm import tqdm
-
-from colossalai.pipeline.pipelinable import PipelinableContext
-from colossalai.pipeline.rpc import OneFOneBPipelineEngine
-
-
-def flatten(x):
-    return torch.flatten(x, 1)
-
-
-def partition(pp_rank: int, chunk: int, stage_num: int):
-    pipelinable = PipelinableContext()
-
-    # build model partitions
-    with pipelinable:
-        # input : [B, 3, 32, 32]
-        _ = resnet50()
-
-    pipelinable.policy = "customized"
-
-    exec_seq = [
-        'conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'avgpool', (flatten, "behind"), 'fc'
-    ]
-    pipelinable.to_layer_list(exec_seq)
-    partition = pipelinable.partition(chunk, stage_num, pp_rank)
-    return partition
-
-
-def run_master(args):
-    batch_size = args.batch_size
-    chunk = args.chunk
-    device = args.device
-    world_size = args.world_size
-    stage_num = world_size
-    num_microbatches = args.num_microbatches
-
-    # build dataloader
-    root = os.environ.get('DATA', './data')
-    train_dataloader, test_dataloader = build_cifar(batch_size, root, padding=4, crop=32, resize=32)
-    criterion = nn.CrossEntropyLoss()
-
-    pp_engine = OneFOneBPipelineEngine(partition_fn=partition,
-                                       stage_num=stage_num,
-                                       num_microbatches=num_microbatches,
-                                       device=device,
-                                       chunk=chunk,
-                                       criterion=criterion,
-                                       checkpoint=False)
-
-    pp_engine.initialize_optimizer(torch.optim.Adam, lr=1e-3)
-    s = time.time()
-
-    for bx, by in tqdm(train_dataloader):
-        pp_engine.forward_backward(bx, labels=by, forward_only=False)
-
-    cost_time = time.time() - s
-
-    print("total cost time :", cost_time)
-    print("cost time per batch:", cost_time / len(train_dataloader))
-
-
-@pytest.mark.skip("Test for performance, no need for CI")
-def main():
-    args = parse_args()
-    # this is due to limitation of partition function
-    args.world_size = 2
-    args.chunk = 1
-    rpc_run(args, run_master)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
index 335be6135..9c3a7e216 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
index 175d9ef6c..03b2e4f2a 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
index 33cb3a65d..cafffd0a6 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
index 73ac2dd5f..9b43be9e8 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 import torch.nn as nn
-import colossalai.nn as col_nn
+import colossalai.legacy.nn as col_nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.initialize import launch