update examples and sphinx docs for the new api (#63)

pull/66/head
Frank Lee 2021-12-13 22:07:01 +08:00 committed by GitHub
parent 7d3711058f
commit 35813ed3c4
124 changed files with 1251 additions and 1462 deletions

View File

@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai
pip install colossalai pip install colossalai
``` ```
### Install From Source ### Install From Source (Recommended)
> We **recommend** you install from source, as Colossal-AI is updated frequently in these early versions. The documentation will stay in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)
```shell ```shell
git clone git@github.com:hpcaitech/ColossalAI.git git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI cd ColossalAI
# install dependency # install dependency
pip install -r requirements/requirements.txt pip install -r requirements/requirements.txt
@ -64,8 +66,8 @@ model = ...
# sampler by default # sampler by default
train_dataset = ... train_dataset = ...
train_dataloader = get_dataloader(dataset=train_dataset, train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True, shuffle=True,
) )
# build your # build your
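A hedged sketch of how this quick-start might continue with the new `colossalai.initialize` API documented later in this diff (all component variables are placeholders; the keyword names and the return tuple follow the `initialize` docstring):

```python
import colossalai

# model, optimizer, criterion and the dataloaders are the placeholder objects built above
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader)
```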

View File

@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
criterion: _Loss, criterion: _Loss,
mode: AMP_TYPE, mode: AMP_TYPE,
amp_config: Config = None): amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param mode: amp mode
:type mode: :class:`colossalai.amp.AMP_TYPE`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
assert isinstance(mode, AMP_TYPE), \ assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode be AMP_TYPE, but got {type(mode)}' f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
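As a usage sketch (the import path for `convert_to_amp` is assumed; the apex config mirrors the `opt_level` setting in the example configs removed later in this diff, and NVIDIA apex must be installed for this mode):

```python
import torch
import torch.nn as nn
from colossalai.amp import AMP_TYPE, convert_to_amp   # assumed import path

# toy stand-ins purely for illustration
model = nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()

# wrap the three components for apex AMP; the config dict is forwarded to apex
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                             mode=AMP_TYPE.APEX,
                                             amp_config=dict(opt_level='O2'))
```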

View File

@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module, def convert_to_apex_amp(model: nn.Module,
optimizer: Optimizer, optimizer: Optimizer,
amp_config): amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for nvidia apex
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
model, optimizer = apex_amp.initialize(model, optimizer, **amp_config) model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
optimizer = ApexAMPOptimizer(optimizer) optimizer = ApexAMPOptimizer(optimizer)
return model, optimizer return model, optimizer

View File

@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32
class ApexAMPOptimizer(ColossalaiOptimizer): class ApexAMPOptimizer(ColossalaiOptimizer):
''' A wrapper class for the Apex optimizer. It implements apex-specific backward and clip_grad_norm
methods.
'''
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
with apex_amp.scale_loss(loss, self.optim) as scaled_loss: with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
def clip_grad_norm(self, model: nn.Module, max_norm: float): def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""
:param model: your model object
:type model: torch.nn.Module
:param max_norm: the max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0: if max_norm > 0:
clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm) clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)

View File

@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module, def convert_to_naive_amp(model: nn.Module,
optimizer: Optimizer, optimizer: Optimizer,
amp_config): amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for naive mode amp
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
if is_no_pp_or_last_stage(): if is_no_pp_or_last_stage():
model = NaiveAMPModel(model, output_to_fp32=True) model = NaiveAMPModel(model, output_to_fp32=True)
else: else:

View File

@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer): class FP16Optimizer(Optimizer):
"""Float16 optimizer for fp16 and bf16 data types. """Float16 optimizer for fp16 and bf16 data types.
Arguments: :param optimizer: base optimizer such as Adam or SGD
optimizer: base optimizer such as Adam or SGD :type optimizer: torch.optim.Optimizer
clip_grad: clip gradients with this global L2 norm. Note :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
that clipping is ignored if clip_grad == 0 :type clip_grad: float
log_num_zeros_in_grad: return number of zeros in the gradients. :param log_num_zeros_in_grad: return number of zeros in the gradients.
params_have_main_grad: flag indicating if parameters have :type log_num_zeros_in_grad: bool
a `main_grad` field. If this is set, we are assuming :param initial_scale: initial scale of gradient scaler
that the model parameters are store in the `main_grad` :type initial_scale: int
field instead of the typical `grad` field. This happens :param growth_factor: the growth rate of loss scale
for the DDP cases where there is a contihuous buffer :type growth_factor: int
holding the gradients. For example for bfloat16, we want :param backoff_factor: the decrease rate of loss scale
to do gradient accumulation and all-reduces in float32 :type backoff_factor: float
and as a result we store those gradients in the main_grad. :param hysterisis: delay shift in dynamic loss scaling
Note that main grad is not necessarily in float32. :type hysterisis: int
bf16: if true, the model is running in bfloat16. :param max_scale: maximum loss scale allowed
grad_scaler: used for scaling gradients. Note that this can be :type max_scale: int
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
a constnat gradient scaler. Also for `bf16 = False`, we
always require a grad scaler.
""" """
def __init__(self, def __init__(self,
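A hedged sketch of constructing the optimizer directly, assuming the constructor accepts the documented parameters as keyword arguments with these names (import path and values are illustrative):

```python
import torch
import torch.nn as nn
from colossalai.amp.naive_amp import FP16Optimizer   # assumed import path

model = nn.Linear(16, 4).half().cuda()               # placeholder fp16 model
base_optim = torch.optim.Adam(model.parameters(), lr=1e-3)

# keyword names follow the docstring above
fp16_optim = FP16Optimizer(base_optim,
                           clip_grad=1.0,        # clipping is skipped when 0
                           initial_scale=2**16,
                           growth_factor=2,
                           backoff_factor=0.5,
                           hysteresis=2,
                           max_scale=2**24)
```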

View File

@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer
class NaiveAMPOptimizer(ColossalaiOptimizer): class NaiveAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class for optimizer to cast all parameters to fp16
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs): def __init__(self, optim: Optimizer, *args, **kwargs):
optim = FP16Optimizer(optimizer=optim, *args, **kwargs) optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
super().__init__(optim) super().__init__(optim)
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""backward with gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
loss = self.optim.scale_loss(loss) loss = self.optim.scale_loss(loss)
loss.backward() loss.backward()
@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
class NaiveAMPModel(nn.Module): class NaiveAMPModel(nn.Module):
"""A wrapper class for model to cast the model into fp16 and
automatically cast the input and output
"""
def __init__(self, def __init__(self,
model: nn.Module, model: nn.Module,

View File

@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
optimizer: Optimizer, optimizer: Optimizer,
criterion: _Loss, criterion: _Loss,
amp_config: Config): amp_config: Config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
model = TorchAMPModel(model) model = TorchAMPModel(model)
optimizer = TorchAMPOptimizer(optimizer, **amp_config) optimizer = TorchAMPOptimizer(optimizer, **amp_config)
criterion = TorchAMPLoss(criterion) criterion = TorchAMPLoss(criterion)

View File

@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32
class TorchAMPOptimizer(ColossalaiOptimizer): class TorchAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class which integrate pytorch amp with an optimizer
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs): def __init__(self, optim: Optimizer, *args, **kwargs):
super().__init__(optim) super().__init__(optim)
self.scaler = GradScaler(*args, **kwargs) self.scaler = GradScaler(*args, **kwargs)
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""backward with torch amp gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
self.scaler.scale(loss).backward() self.scaler.scale(loss).backward()
def step(self): def step(self):
"""update the parameters of the model
"""
self.scaler.step(self.optim) self.scaler.step(self.optim)
self.scaler.update() self.scaler.update()
def clip_grad_norm(self, model: nn.Module, max_norm: float): def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""apply gradient clipping to the model parameters
:param model: your model object
:type model: torch.nn.Module
:param max_norm: max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0.0: if max_norm > 0.0:
self.scaler.unscale_(self.optim) self.scaler.unscale_(self.optim)
clip_grad_norm_fp32(model.parameters(), max_norm) clip_grad_norm_fp32(model.parameters(), max_norm)
class TorchAMPModel(nn.Module): class TorchAMPModel(nn.Module):
"""A wrapper class for a model object which executes forward with values automatically
cast to fp16
"""
def __init__(self, model: nn.Module) -> None: def __init__(self, model: nn.Module) -> None:
super().__init__() super().__init__()
@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):
class TorchAMPLoss(nn.Module): class TorchAMPLoss(nn.Module):
"""A wrapper class for a criterion object which computes the loss in mixed-precision context
:param loss: a loss function object
:type loss: torch.nn.modules.loss._Loss
"""
def __init__(self, loss: _Loss): def __init__(self, loss: _Loss):
super().__init__() super().__init__()
self.loss = loss self.loss = loss
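A sketch of a training step with the wrapped components returned by `convert_to_torch_amp` (the data loop is a placeholder, and `zero_grad` is assumed to be forwarded to the inner optimizer by the wrapper):

```python
# model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config=dict())
for data, label in train_dataloader:               # placeholder dataloader
    optimizer.zero_grad()
    output = model(data)                           # forward with values automatically cast to fp16
    loss = criterion(output, label)                # loss computed in the mixed-precision context
    optimizer.backward(loss)                       # scaled backward via the internal GradScaler
    optimizer.clip_grad_norm(model, max_norm=1.0)  # unscale then clip, as documented above
    optimizer.step()                               # scaler.step + scaler.update
```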

View File

@ -16,8 +16,8 @@ def build_from_config(module, config: dict):
of the return object of the return object
:type config: dict :type config: dict
:raises AssertionError: Raises an AssertionError if `module` is not a class :raises AssertionError: Raises an AssertionError if `module` is not a class
:return: An object of :class:`module` :return: An object of interest
:rtype: :class:`module` :rtype: Object
""" """
assert inspect.isclass(module), 'module must be a class' assert inspect.isclass(module), 'module must be a class'
return module(**config) return module(**config)
@ -62,8 +62,8 @@ def build_layer(config):
:param config: A python dict or a :class:`colossalai.context.Config` object :param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config` :type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module` :return: An object of :class:`torch.nn.Module`
:rtype: :class:`nn.Module` :rtype: :class:`torch.nn.Module`
""" """
return build_from_registry(config, LAYERS) return build_from_registry(config, LAYERS)
@ -75,8 +75,8 @@ def build_loss(config):
:param config: A python dict or a :class:`colossalai.context.Config` object :param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config` :type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.autograd.Function` :return: An object of :class:`torch.nn.modules.loss._Loss`
:rtype: :class:`torch.autograd.Function` :rtype: :class:`torch.nn.modules.loss._Loss`
""" """
return build_from_registry(config, LOSSES) return build_from_registry(config, LOSSES)
@ -87,8 +87,8 @@ def build_model(config):
:param config: A python dict or a :class:`colossalai.context.Config` object :param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config` :type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module` :return: An object of :class:`torch.nn.Module`
:rtype: :class:`nn.Module` :rtype: :class:`torch.nn.Module`
""" """
return build_from_registry(config, MODELS) return build_from_registry(config, MODELS)
@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer):
:type model: :class:`nn.Module` :type model: :class:`nn.Module`
:param optimizer: An optimizer object containing parameters for the gradient handler :param optimizer: An optimizer object containing parameters for the gradient handler
:type optimizer: :class:`torch.optim.Optimizer` :type optimizer: :class:`torch.optim.Optimizer`
:return: An object of :class:`BaseGradientHandler` :return: An object of :class:`colossalai.engine.BaseGradientHandler`
:rtype: :class:`BaseGradientHandler` :rtype: :class:`colossalai.engine.BaseGradientHandler`
""" """
config_ = config.copy() config_ = config.copy()
config_['model'] = model config_['model'] = model
@ -151,8 +151,8 @@ def build_hooks(config, trainer):
:type config: dict or :class:`colossalai.context.Config` :type config: dict or :class:`colossalai.context.Config`
:param trainer: A :class:`Trainer` object containing parameters for the hook :param trainer: A :class:`Trainer` object containing parameters for the hook
:type trainer: :class:`Trainer` :type trainer: :class:`Trainer`
:return: An object of :class:`BaseHook` :return: An object of :class:`colossalai.trainer.hooks.BaseHook`
:rtype: :class:`BaseHook` :rtype: :class:`colossalai.trainer.hooks.BaseHook`
""" """
config_ = config.copy() config_ = config.copy()
config_['trainer'] = trainer config_['trainer'] = trainer
@ -182,8 +182,8 @@ def build_data_sampler(config, dataset):
:param dataset: An object of :class:`torch.utils.data.Dataset` containing information :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
used in the construction of the return object used in the construction of the return object
:type dataset: :class:`torch.utils.data.Dataset` :type dataset: :class:`torch.utils.data.Dataset`
:return: An object of :class:`colossalai.nn.data.sampler.BaseSampler` :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
:rtype: :class:`colossalai.nn.data.sampler.BaseSampler` :rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
""" """
config_ = config.copy() config_ = config.copy()
config_['dataset'] = dataset config_['dataset'] = dataset
@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer):
:param optimizer: An optimizer object containing parameters for the learning rate :param optimizer: An optimizer object containing parameters for the learning rate
scheduler scheduler
:type optimizer: :class:`torch.optim.Optimizer` :type optimizer: :class:`torch.optim.Optimizer`
:param total_steps: Number of total steps of the learning rate scheduler
:type total_steps: int
:param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
:type num_steps_per_epoch: int
:return: An object of :class:`torch.optim.lr_scheduler` :return: An object of :class:`torch.optim.lr_scheduler`
:rtype: :class:`torch.optim.lr_scheduler` :rtype: :class:`torch.optim.lr_scheduler`
""" """

View File

@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):
class PipelineModelInitializer(): class PipelineModelInitializer():
"""An intializer to split the model into different stages for pipeline parallelism.
An example for the model config is shown below. The class VisionTransformerFromConfig should
inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build the model from a sequence
of layer configurations.
model_config = dict(
type='VisionTransformerFromConfig',
embedding_cfg=dict(...),
...
)
:param config: configuration of the model
:type config: dict
:param num_chunks: the number of chunks you want to have on the current stage. This value should be 1
in most cases unless you are using virtual pipeline parallelism.
:type num_chunks: int
:param verbose: whether to print the logs
:type verbose: bool
"""
def __init__(self, config, num_chunks, verbose=False): def __init__(self, config, num_chunks, verbose=False):
self.num_chunks = num_chunks self.num_chunks = num_chunks
self.ori_model = build_model(config) self.ori_model = build_model(config)
@ -161,6 +183,13 @@ class PipelineModelInitializer():
self._logger.info(f"The total length of layers is {layer_length}", ranks=[0]) self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])
def initialize(self, partition_method='parameter'): def initialize(self, partition_method='parameter'):
"""Initialize the model object from the config passed
:param partition_method: this parameter determines how the model layers are split into stages;
it can be set to 'layer' or 'parameter'
:type partition_method: str
"""
# Some space for initializing communication groups # Some space for initializing communication groups
self._interval = None self._interval = None
self._partition_layers(method=partition_method) self._partition_layers(method=partition_method)
@ -183,7 +212,7 @@ class PipelineModelInitializer():
# print_rank_0(param_counts) # print_rank_0(param_counts)
self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks) self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
else: else:
assert method == 'layer', "Method should be a pre-set string" raise ValueError("Method should be a pre-set string in [layer, parameter]")
# Display the partition # Display the partition
if gpc.get_global_rank() == 0 and self.verbose: if gpc.get_global_rank() == 0 and self.verbose:
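Roughly, the initializer would be driven as in the sketch below (import path assumed; the model config is abbreviated, and `initialize` is assumed to return the partitioned model for the current pipeline stage):

```python
from colossalai.builder import PipelineModelInitializer   # assumed import path

model_config = dict(
    type='VisionTransformerFromConfig',
    embedding_cfg=dict(type='ViTPatchEmbedding2D', img_size=32, patch_size=4, embed_dim=512),
    # ... remaining layer configs, as in the ViT example config later in this diff
)

initializer = PipelineModelInitializer(model_config, num_chunks=1, verbose=True)
model = initializer.initialize(partition_method='parameter')
```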

View File

@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int,
:param tensor: Tensor to be gathered :param tensor: Tensor to be gathered
:param dim: The dimension concatenating in :param dim: The dimension concatenating in
:param parallel_mode: Parallel group mode used in this communication :param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor :type tensor: :class:`torch.Tensor`
:type dim: int :type dim: int
:type parallel_mode: ParallelMode :type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by all-gather :return: The tensor generated by all-gather
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
depth = gpc.get_world_size(parallel_mode) depth = gpc.get_world_size(parallel_mode)
temp = tensor.clone() temp = tensor.clone()
@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int,
:param tensor: Tensor to be reduced and scattered :param tensor: Tensor to be reduced and scattered
:param dim: The dimension scattering in :param dim: The dimension scattering in
:param parallel_mode: Parallel group mode used in this communication :param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor :type tensor: :class:`torch.Tensor`
:type dim: int :type dim: int
:type parallel_mode: ParallelMode :type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by reduce-scatter :return: The tensor generated by reduce-scatter
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
depth = gpc.get_world_size(parallel_mode) depth = gpc.get_world_size(parallel_mode)
# temp = list(torch.chunk(tensor, depth, dim=dim)) # temp = list(torch.chunk(tensor, depth, dim=dim))
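For example (a sketch that must run inside an initialized distributed group; the import path and the parallel mode chosen here are only illustrative):

```python
import torch
from colossalai.communication import all_gather, reduce_scatter   # assumed import path
from colossalai.context import ParallelMode

x = torch.randn(4, 8).cuda()
gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.GLOBAL)       # concatenate along dim 0 across the group
scattered = reduce_scatter(x, dim=0, parallel_mode=ParallelMode.GLOBAL)  # reduce, then scatter chunks along dim 0
```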

View File

@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None):
:type input_tensor_shape: torch.Size :type input_tensor_shape: torch.Size
:type prev_rank: int, optional :type prev_rank: int, optional
:return: The input tensor in forward step :return: The input tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
if gpc.is_first_rank(ParallelMode.PIPELINE): if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None input_tensor = None
@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None):
:type output_grad_shape: torch.Size :type output_grad_shape: torch.Size
:type next_rank: int, optional :type next_rank: int, optional
:return: The grad of output tensor in forward step :return: The grad of output tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
if gpc.is_last_rank(ParallelMode.PIPELINE): if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None output_tensor_grad = None
@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None):
:param output_tensor: Tensor to be sent :param output_tensor: Tensor to be sent
:param next_rank: The rank of the recipient of the tensor :param next_rank: The rank of the recipient of the tensor
:type output_tensor: Tensor :type output_tensor: :class:`torch.Tensor`
:type next_rank: int, optional :type next_rank: int, optional
""" """
if not gpc.is_last_rank(ParallelMode.PIPELINE): if not gpc.is_last_rank(ParallelMode.PIPELINE):
@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None):
:param input_tensor_grad: Tensor to be sent :param input_tensor_grad: Tensor to be sent
:param prev_rank: The rank of the recipient of the tensor :param prev_rank: The rank of the recipient of the tensor
:type input_tensor_grad: Tensor :type input_tensor_grad: :class:`torch.Tensor`
:type prev_rank: int, optional :type prev_rank: int, optional
""" """
if not gpc.is_first_rank(ParallelMode.PIPELINE): if not gpc.is_first_rank(ParallelMode.PIPELINE):
@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor,
:param output_tensor: Tensor to be sent :param output_tensor: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received :param output_grad_shape: The shape of the tensor to be received
:type output_tensor: Tensor :type output_tensor: :class:`torch.Tensor`
:type output_grad_shape: torch.Size :type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step :return: The grad of output tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
if gpc.is_last_rank(ParallelMode.PIPELINE): if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None output_tensor_grad = None
@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent :param input_tensor_grad: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received :param input_tensor_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor :type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: torch.Size :type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step :return: The input tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
if gpc.is_first_rank(ParallelMode.PIPELINE): if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None input_tensor = None
@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor,
:param output_tensor: Tensor to be sent :param output_tensor: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received :param input_tensor_shape: The shape of the tensor to be received
:type output_tensor: Tensor :type output_tensor: :class:`torch.Tensor`
:type input_tensor_shape: torch.Size :type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step :return: The input tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
input_tensor, _ = _communicate(tensor_send_next=output_tensor, input_tensor, _ = _communicate(tensor_send_next=output_tensor,
recv_prev=recv_prev, recv_prev=recv_prev,
@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent :param input_tensor_grad: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received :param output_grad_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor :type input_tensor_grad: :class:`torch.Tensor`
:type output_grad_shape: torch.Size :type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step :return: The grad of output tensor in forward step
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
_, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad, _, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
recv_next=recv_next, recv_next=recv_next,
@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor,
:param input_tensor_grad: Tensor sent to the previous :param input_tensor_grad: Tensor sent to the previous
:param input_tensor_shape: The shape of the tensor received from the previous :param input_tensor_shape: The shape of the tensor received from the previous
:param output_grad_shape: The shape of the tensor received from the next :param output_grad_shape: The shape of the tensor received from the next
:type output_tensor: Tensor :type output_tensor: :class:`torch.Tensor`
:type input_tensor_grad: Tensor :type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: torch.Size :type input_tensor_shape: :class:`torch.Size`
:type output_grad_shape: torch.Size :type output_grad_shape: :class:`torch.Size`
:return: (the input tensor in forward step, the grad of output tensor in forward step) :return: (the input tensor in forward step, the grad of output tensor in forward step)
:rtype: (Tensor, Tensor) :rtype: (Tensor, Tensor)
""" """

View File

@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
:param tensor_send_next: Tensor sent to next member :param tensor_send_next: Tensor sent to next member
:param parallel_mode: Parallel group mode used in this communication :param parallel_mode: Parallel group mode used in this communication
:type tensor_send_next: Tensor :type tensor_send_next: :class:`torch.Tensor`
:type parallel_mode: ParallelMode :type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor received from the previous :return: The tensor received from the previous
:rtype: Tensor :rtype: :class:`torch.Tensor`
""" """
buffer_shape = tensor_send_next.size() buffer_shape = tensor_send_next.size()

View File

@ -433,6 +433,9 @@ class ParallelContext:
def set_device(self, device_ordinal: int = None): def set_device(self, device_ordinal: int = None):
"""Sets distributed processes to be bound to devices. """Sets distributed processes to be bound to devices.
:param device_ordinal: the device id to be bound to
:type device_ordinal: int
""" """
global_rank = self.get_global_rank() global_rank = self.get_global_rank()
if device_ordinal is None: if device_ordinal is None:
@ -445,6 +448,9 @@ class ParallelContext:
def set_seed(self, seed: int): def set_seed(self, seed: int):
"""Sets seeds for all random libraries. """Sets seeds for all random libraries.
:param seed: seed for random states
:type seed: int
""" """
random.seed(seed) random.seed(seed)
np.random.seed(seed) np.random.seed(seed)

View File

@ -57,38 +57,61 @@ class Engine:
@property @property
def model(self): def model(self):
"""model attached to the engine"""
return self._model return self._model
@property @property
def optimizer(self): def optimizer(self):
"""optimizer attached to the engine"""
return self._optimizer return self._optimizer
@property @property
def criterion(self): def criterion(self):
"""criterion attached to the engine"""
return self._criterion return self._criterion
@property
def schedule(self):
return self._schedule
def zero_grad(self): def zero_grad(self):
"""set the gradient of parameters to zero
"""
self.optimizer.zero_grad() self.optimizer.zero_grad()
def step(self): def step(self):
"""execute parameter update
"""
self._all_reduce_gradients() self._all_reduce_gradients()
self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
self.optimizer.step() self.optimizer.step()
def backward(self, loss: Tensor): def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
:param loss: loss value computed by a loss function
:type loss: :class:`torch.Tensor`
"""
return self.optimizer.backward(loss) return self.optimizer.backward(loss)
def backward_by_grad(self, tensor, grad): def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
:param tensor: output tensor
:type tensor: :class:`torch.Tensor`
:param grad: gradient passed back to the output
:type grad: :class:`torch.Tensor`
"""
return self.optimizer.backward_by_grad(tensor, grad) return self.optimizer.backward_by_grad(tensor, grad)
def calc_loss(self, *args, **kwargs): def calc_loss(self, *args, **kwargs):
"""compute the loss value
:return: the loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs) return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
"""run the forward step for the model
:return: output of the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
"""
return self.model(*args, **kwargs) return self.model(*args, **kwargs)
def _all_reduce_gradients(self): def _all_reduce_gradients(self):
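Put together, a single training step through the engine might look like this sketch (`engine` and `train_dataloader` come from `colossalai.initialize`; the data variables are placeholders):

```python
for img, label in train_dataloader:
    engine.zero_grad()                        # zero gradients via the attached optimizer
    output = engine(img)                      # forward step through the attached model
    loss = engine.criterion(output, label)    # loss via the attached criterion
    engine.backward(loss)                     # backward via the (possibly AMP-wrapped) optimizer
    engine.step()                             # all-reduce, clip and update parameters
```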

View File

@ -48,7 +48,7 @@ class BaseSchedule(ABC):
already in the same GPU as where the model's. already in the same GPU as where the model's.
:return: (data, label) :return: (data, label)
:rtype: (Tensor, Tensor) :rtype: (:class:`torch.Tensor`, :class:`torch.Tensor`)
""" """
if data_iter is None: if data_iter is None:
raise RuntimeError('Dataloader is not defined.') raise RuntimeError('Dataloader is not defined.')

View File

@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule):
:type data_iter: Iterator :type data_iter: Iterator
:type forward_only: bool, optional :type forward_only: bool, optional
:type return_loss: bool, optional :type return_loss: bool, optional
:return: (output, label, loss) :return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False." "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."

View File

@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor """Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used. is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users. Returns output tensor. This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: a list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
""" """
if input_tensor is None: if input_tensor is None:
@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule):
output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor. output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
Returns the gradients with respect to the input tensor (None if first stage). Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users. This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
""" """
# Retain the grad on the input_tensor. # Retain the grad on the input_tensor.
@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages. """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise. Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param data_iter: dataloader in the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: whether to run the forward step only. Default is False. If True, no backward pass will be run.
:type forward_only: bool
:param return_loss: whether to return the loss value. Default is True.
:type return_loss: bool
:return: (output, label, loss) :return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
""" """
assert forward_only or return_loss, \ assert forward_only or return_loss, \

View File

@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict],
:param local_rank: rank for the process on the node and is used to set the default CUDA device, :param local_rank: rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional :type local_rank: int, optional
:param verbose: whether to print logs
:type verbose: bool
:raises Exception: raise exception when config type is wrong :raises Exception: raise exception when config type is wrong
''' '''
gpc.verbose = verbose gpc.verbose = verbose
@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
backend: str = 'nccl', backend: str = 'nccl',
seed: int = 1024, seed: int = 1024,
verbose: bool = True): verbose: bool = True):
'''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['SLURM_PROCID']) rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS']) world_size = int(os.environ['SLURM_NPROCS'])
launch(config=config, launch(config=config,
@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
backend: str = 'nccl', backend: str = 'nccl',
seed: int = 1024, seed: int = 1024,
verbose: bool = True): verbose: bool = True):
'''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['OMPI_COMM_WORLD_RANK']) rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
backend: str = 'nccl', backend: str = 'nccl',
seed: int = 1024, seed: int = 1024,
verbose: bool = True): verbose: bool = True):
'''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['RANK']) rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK']) local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE']) world_size = int(os.environ['WORLD_SIZE'])
@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config. ''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
:param model: your model instance :param model: your model instance
:type model: a single or a list of ``torch.nn.Module`` objects :type model: :class:`torch.nn.Module`
:param optimizer: your optimizer instance :param optimizer: your optimizer instance
:type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects :type optimizer: :class:`torch.optim.optimizer.Optimizer`
:param criterion: your criterion instance :param criterion: your criterion instance
:type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects :type criterion: :class:`torch.nn.modules.loss._Loss`
:param train_dataloader: dataloaders for training data :param train_dataloader: dataloader for training data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None :type train_dataloader: :class:`torch.utils.data.DataLoader`
:param train_dataloader: dataloaders for testing data :param test_dataloader: dataloader for testing data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None :type test_dataloader: :class:`torch.utils.data.DataLoader`
:return: (engine, criterion, train_dataloader, test_dataloader) :param lr_scheduler: your lr scheduler instance
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param verbose: whether to print logs
:type verbose: bool
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: tuple :rtype: tuple
''' '''
# get logger # get logger
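For instance, the torchrun/torch.distributed.launch wrapper described above might be driven as in this sketch (config path and master address/port are placeholders; the script itself would be started with `torchrun` or `python -m torch.distributed.launch`):

```python
import colossalai

# RANK, LOCAL_RANK and WORLD_SIZE are read from the environment variables set by PyTorch
colossalai.launch_from_torch(config='./config.py',
                             host='localhost',   # placeholder master address
                             port=29500,         # placeholder master port
                             backend='nccl')
```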

View File

@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'): def get_dist_logger(name='root'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances, """Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name. which means that only one logger instance is created per name.
:param name: name of the logger, name must be unique
:type name: str
:return: a distributed logger instance
:rtype: :class:`colossalai.logging.DistributedLogger`
""" """
return DistributedLogger.get_instance(name=name) return DistributedLogger.get_instance(name=name)
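Typical use, matching the `ranks=[0]` call seen elsewhere in this diff:

```python
from colossalai.logging import get_dist_logger

logger = get_dist_logger()                          # singleton instance for the default 'root' name
logger.info('initialization finished', ranks=[0])   # only global rank 0 prints this message
```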

View File

@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module): class VanillaViTPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding """ 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: size of a patch
:type patch_size: int
:param in_chans: input channels
:type in_chans: int
:param embed_dim: embedding dimension
:type embed_dim: int
:param norm_layer: layer norm class, defaults to None
:type norm_layer: Callable
:param flatten: whether to flatten the output
:type flatten: bool
:param drop: dropout rate
:type drop: float
""" """
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.): def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
super().__init__() super().__init__()
img_size = to_2tuple(img_size) img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size) patch_size = to_2tuple(patch_size)
@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
@LAYERS.register_module @LAYERS.register_module
class VanillaViTMLP(nn.Module): class VanillaViTMLP(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks """ MLP as used in Vision Transformer, MLP-Mixer and related networks
:param in_features: input channels
:type in_features: int
:param hidden_features: channels of the output of the first dense layer
:type hidden_features: int
:param out_features: channels of the output of the second dense layer
:type out_features: int
:param act_layer: activation function
:type act_layer: Callable
:param drop: dropout rate
:type drop: float
""" """
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
super().__init__() super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features) self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer() self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features) self.fc2 = nn.Linear(hidden_features, out_features)
@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument. 'survival rate' as the argument.
:param drop_prob: probability for dropout
:type drop_prob: float
:param training: whether it is training mode
:type training: bool
""" """
if drop_prob == 0. or not training: if drop_prob == 0. or not training:
return x return x
@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
@LAYERS.register_module @LAYERS.register_module
class VanillaViTDropPath(nn.Module): class VanillaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
:param drop_prob: probability of dropping a path
:type drop_prob: float
""" """
def __init__(self, drop_prob=0.): def __init__(self, drop_prob=0.):
@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
:param dim: dimension of input tensor :param dim: dimension of input tensor
:type dim: int :type dim: int
:param num_heads: number of attention heads, defaults to 8 :param num_heads: number of attention heads
:type num_heads: int, optional :type num_heads: int, optional
:param qkv_bias: enable bias for qkv if True, defaults to False :param qkv_bias: enable bias for qkv if True, defaults to False
:type qkv_bias: bool, optional :type qkv_bias: bool, optional
@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
:type proj_drop: float, optional :type proj_drop: float, optional
""" """
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__() super().__init__()
self.num_heads = num_heads self.num_heads = num_heads
head_dim = dim // num_heads head_dim = dim // num_heads

View File

@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
added functionality to handle model parallel parameters. Note that added functionality to handle model parallel parameters. Note that
the gradients are modified in place. the gradients are modified in place.
Arguments: :param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a :type parameters: (Iterable[Tensor] or Tensor)
single Tensor that will have gradients normalized :param max_norm: max norm of the gradients
max_norm (float or int): max norm of the gradients :type max_norm: float or int
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for :param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
infinity norm. :type norm_type: float or int
Returns: :return: Total norm of the parameters (viewed as a single vector).
Total norm of the parameters (viewed as a single vector). :rtype: float
""" """
if isinstance(parameters, torch.Tensor): if isinstance(parameters, torch.Tensor):

View File

@ -123,12 +123,23 @@ def get_dataloader(dataset,
stage and label on the last stage stage and label on the last stage
:param dataset: a :class:utils.data.dataset dataset :param dataset: a :class:utils.data.dataset dataset
:param shuffle: whether to shuffle the dataset
:param seed: random worker seed, defaults to 1024 :param seed: random worker seed, defaults to 1024
:type seed: int, optional :param add_sampler: add DistributedDataParallelSampler to the dataset
:param add_sampler_if_possible: [description], defaults to False :param drop_last: drop the last incomplete batch of data
:type add_sampler_if_possible: bool, optional :param pin_memory: whether to pin memory address in CPU memory
:return: a :class:utils.data.dataset dataloader :param num_workers: number of worker threads for this dataloader
:rtype: torch.utils.data.dataset
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: an object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
''' '''
_kwargs = kwargs.copy() _kwargs = kwargs.copy()
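A usage sketch matching the documented parameters (the CIFAR-10 dataset is only a stand-in, the import path is assumed, and `batch_size` is assumed to be forwarded to `torch.utils.data.DataLoader` through `**kwargs`):

```python
from torchvision import datasets, transforms
from colossalai.utils import get_dataloader   # assumed import path

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                 transform=transforms.ToTensor())
train_dataloader = get_dataloader(dataset=train_dataset,
                                  shuffle=True,
                                  add_sampler=True,   # attach a distributed sampler
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=2,
                                  batch_size=256)     # forwarded to the underlying DataLoader (assumed)
```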

View File

@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int, accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None, gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None): lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model) optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size) dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
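The same wiring can also be done by hand with the wrapper classes documented in the next file; a sketch, with `model`, `optimizer` and `train_dataloader` as placeholders and the import path assumed:

```python
from colossalai.engine.gradient_accumulation import GradAccumOptimizer, GradAccumDataloader  # assumed import path

accumulate_size = 4
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
train_dataloader = GradAccumDataloader(train_dataloader, accumulate_size=accumulate_size)
# optimizer.step() now only updates the parameters once every `accumulate_size` calls,
# and the dataloader drops the trailing batches that do not fill a complete cycle
```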

View File

@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer): class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param model: your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None): def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
super().__init__(optim) super().__init__(optim)
@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader(): class GradAccumDataloader():
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None: def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
self.dataloader = dataloader self.dataloader = dataloader
@ -99,6 +123,15 @@ class GradAccumDataloader():
class GradAccumLrSchedulerByStep(_LRScheduler): class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None: def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
self.lr_scheduler = lr_scheduler self.lr_scheduler = lr_scheduler
@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler(): class GradAccumGradientHandler():
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param grad_handler: your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None: def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
assert isinstance(grad_handler, BaseGradientHandler), \ assert isinstance(grad_handler, BaseGradientHandler), \

View File

@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
:param message: a prefix message to add in the log :param message: a prefix message to add in the log
:type message: str :type message: str
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`
:param report_cpu: whether to report CPU memory
:type report_cpu: bool
:raises EnvironmentError: raise error if no distributed environment has been initialized :raises EnvironmentError: raise error if no distributed environment has been initialized
''' '''
if not gpc.is_initialized(ParallelMode.GLOBAL): if not gpc.is_initialized(ParallelMode.GLOBAL):

View File

@ -2,6 +2,13 @@
class MultiTensorApply(object): class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
:param chunk_size: size of a chunk
:type chunk_size: int
"""
available = False available = False
warned = False warned = False

View File

@ -74,6 +74,9 @@ class Timer:
class MultiTimer: class MultiTimer:
'''An object that contains multiple timers '''An object that contains multiple timers
:param on: whether the timer is enabled. Default is True
:type on: bool
''' '''
def __init__(self, on: bool = True): def __init__(self, on: bool = True):

View File

@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
optimizer: Optimizer, optimizer: Optimizer,
level: int, level: int,
zero_config): zero_config):
"""
A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param level: optimizer level, can be 2 or 3
:type level: int
:param zero_config: configuration for zero
:type zero_config: dict
:return: (model, optimizer)
:rtype: Tuple
"""
assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided' assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
if level == 2: if level == 2:
if is_no_pp_or_last_stage(): if is_no_pp_or_last_stage():
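A hedged sketch of the documented call (import path assumed; `zero_config` is left empty here because its option names are not shown in this diff):

```python
from colossalai.zero import convert_to_zero   # assumed import path

# level must be 2 or 3, matching the assertion above
model, optimizer = convert_to_zero(model=model,
                                   optimizer=optimizer,
                                   level=2,
                                   zero_config=dict())
```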

View File

@ -1,76 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
IMG_SIZE = 224
BATCH_SIZE = 256
NUM_EPOCHS = 100
model = dict(
type='VanillaResNet',
block_type='ResNetBottleneck',
layers=[3, 4, 6, 3],
num_cls=10
)
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
parallelization = dict(
pipeline=1,
tensor=dict(size=1, mode=None),
)
optimizer = dict(
type='Adam',
lr=0.01
)
loss = dict(
type='CrossEntropyLoss'
)
from colossalai.engine import AMP_TYPE
fp16 = dict(
mode=AMP_TYPE.APEX,
opt_level='O2',
)

View File

@ -1,22 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
NUM_EPOCH = int
model = dict()
train_data = dict()
test_data = dict()
optimizer = dict()
loss = dict()
fp16 = dict()
zero = dict()
gradient_handler = []
parallel = dict()
hooks = []
cudnn_benchmark = True
cudnn_deterministic = False
logging = dict()

View File

@ -1,165 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
drop_last=True,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
loss = dict(
type='CrossEntropyLoss2D',
)
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='TensorboardHook', log_dir='./tb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
# mode=AMP_TYPE.PARALLEL,
# initial_scale=2 ** 8
# )
# only needed when pipeline parallel is used
# schedule = dict(
# num_microbatches=8
# )
logging = dict(
root_path='./logs'
)

View File

@ -1,111 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE
try:
import model_zoo
except:
print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')
BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
)
)
optimizer = dict(
type='Adam',
lr=0.001
)
loss = dict(
type='CrossEntropyLoss3D',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)
model = dict(
type='vit_tiny_3d_patch4_32',
drop_rate=0.1,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(
type='Accuracy3DHook',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=8, mode='3d'),
)
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
logging = dict(
root_path='./logs'
)

View File

@ -77,10 +77,10 @@ fp16 = dict(
) )
``` ```
## Tensor Parallel AMP ## Naive AMP
We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism. and pipeline parallelism. This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
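
The config block itself is cut off in this hunk; as a rough sketch under the new `colossalai.amp` API, such an entry might look like the following (everything beyond `mode` is an assumption):

```python
from colossalai.amp import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.NAIVE,
    # loss-scaling options would go here, e.g. initial_scale=2 ** 32
)
```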

View File

@ -0,0 +1,5 @@
colossalai.amp.apex\_amp
==========================
.. automodule:: colossalai.amp.apex_amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.naive\_amp
==========================
.. automodule:: colossalai.amp.naive_amp
:members:

View File

@ -0,0 +1,13 @@
colossalai.amp
==================
.. toctree::
:maxdepth: 2
colossalai.amp.torch_amp
colossalai.amp.apex_amp
colossalai.amp.naive_amp
.. automodule:: colossalai.amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.torch\_amp
==========================
.. automodule:: colossalai.amp.torch_amp
:members:

View File

@ -1,12 +1,12 @@
colossalai.builder colossalai.builder
================== ==================
.. automodule:: colossalai.builder
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.builder.builder colossalai.builder.builder
colossalai.builder.pipeline colossalai.builder.pipeline
.. automodule:: colossalai.builder
:members:

View File

@ -1,5 +0,0 @@
colossalai.checkpointing
========================
.. automodule:: colossalai.checkpointing
:members:

View File

@ -1,10 +1,6 @@
colossalai.communication colossalai.communication
======================== ========================
.. automodule:: colossalai.communication
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -12,3 +8,7 @@ colossalai.communication
colossalai.communication.p2p colossalai.communication.p2p
colossalai.communication.ring colossalai.communication.ring
colossalai.communication.utils colossalai.communication.utils
.. automodule:: colossalai.communication
:members:

View File

@ -1,11 +1,11 @@
colossalai.context.random colossalai.context.random
========================= =========================
.. automodule:: colossalai.context.random
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.context.random.seed_manager colossalai.context.random.seed_manager
.. automodule:: colossalai.context.random
:members:

View File

@ -1,9 +1,6 @@
colossalai.context colossalai.context
================== ==================
.. automodule:: colossalai.context
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -17,3 +14,7 @@ colossalai.context
colossalai.context.config colossalai.context.config
colossalai.context.parallel_context colossalai.context.parallel_context
colossalai.context.parallel_mode colossalai.context.parallel_mode
.. automodule:: colossalai.context
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.amp\_type
===============================
.. automodule:: colossalai.engine.amp.amp_type
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.grad\_scaler
==================================
.. automodule:: colossalai.engine.amp.grad_scaler
:members:

View File

@ -1,12 +0,0 @@
colossalai.engine.amp
=====================
.. automodule:: colossalai.engine.amp
:members:
.. toctree::
:maxdepth: 2
colossalai.engine.amp.amp_type
colossalai.engine.amp.grad_scaler

View File

@ -1,12 +1,12 @@
colossalai.engine colossalai.engine
================= =================
.. automodule:: colossalai.engine
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.engine.amp
colossalai.engine.gradient_handler colossalai.engine.gradient_handler
colossalai.engine.schedule colossalai.engine.schedule
.. automodule:: colossalai.engine
:members:

View File

@ -1,11 +1,11 @@
colossalai.logging colossalai.logging
================== ==================
.. automodule:: colossalai.logging
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.logging.logging colossalai.logging.logging
.. automodule:: colossalai.logging
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.base\_dataset
================================
.. automodule:: colossalai.nn.data.base_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.caltech101\_dataset
======================================
.. automodule:: colossalai.nn.data.caltech101_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.cifar10\_dataset
===================================
.. automodule:: colossalai.nn.data.cifar10_dataset
:members:

View File

@ -1,18 +0,0 @@
colossalai.nn.data
==================
.. automodule:: colossalai.nn.data
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler
.. toctree::
:maxdepth: 2
colossalai.nn.data.base_dataset
colossalai.nn.data.caltech101_dataset
colossalai.nn.data.cifar10_dataset

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.base\_sampler
========================================
.. automodule:: colossalai.nn.data.sampler.base_sampler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.data\_parallel\_sampler
==================================================
.. automodule:: colossalai.nn.data.sampler.data_parallel_sampler
:members:

View File

@ -1,12 +0,0 @@
colossalai.nn.data.sampler
==========================
.. automodule:: colossalai.nn.data.sampler
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler.base_sampler
colossalai.nn.data.sampler.data_parallel_sampler

View File

@ -0,0 +1,5 @@
colossalai.nn.layer.non\_parallel\_layers
==========================================
.. automodule:: colossalai.nn.layer.non_parallel_layers
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_1d colossalai.nn.layer.parallel\_1d
================================ ================================
.. automodule:: colossalai.nn.layer.parallel_1d
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.layer.parallel_1d.layers colossalai.nn.layer.parallel_1d.layers
.. automodule:: colossalai.nn.layer.parallel_1d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2d colossalai.nn.layer.parallel\_2d
================================ ================================
.. automodule:: colossalai.nn.layer.parallel_2d
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.layer.parallel_2d.layers colossalai.nn.layer.parallel_2d.layers
.. automodule:: colossalai.nn.layer.parallel_2d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2p5d colossalai.nn.layer.parallel\_2p5d
================================== ==================================
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.layer.parallel_2p5d.layers colossalai.nn.layer.parallel_2p5d.layers
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_3d colossalai.nn.layer.parallel\_3d
================================ ================================
.. automodule:: colossalai.nn.layer.parallel_3d
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.layer.parallel_3d.layers colossalai.nn.layer.parallel_3d.layers
.. automodule:: colossalai.nn.layer.parallel_3d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_sequence colossalai.nn.layer.parallel\_sequence
====================================== ======================================
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.layer.parallel_sequence.layers colossalai.nn.layer.parallel_sequence.layers
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer.layers
========================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer
=================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_vision_transformer.layers

View File

@ -1,9 +1,6 @@
colossalai.nn.layer colossalai.nn.layer
=================== ===================
.. automodule:: colossalai.nn.layer
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -12,13 +9,10 @@ colossalai.nn.layer
colossalai.nn.layer.parallel_2p5d colossalai.nn.layer.parallel_2p5d
colossalai.nn.layer.parallel_3d colossalai.nn.layer.parallel_3d
colossalai.nn.layer.parallel_sequence colossalai.nn.layer.parallel_sequence
colossalai.nn.layer.parallel_vision_transformer colossalai.nn.layer.non_parallel_layers
colossalai.nn.layer.vanilla_resnet
colossalai.nn.layer.vanilla_vision_transformer
colossalai.nn.layer.wrapper colossalai.nn.layer.wrapper
.. toctree::
:maxdepth: 2
colossalai.nn.layer.base_layer colossalai.nn.layer.base_layer
.. automodule:: colossalai.nn.layer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.basic\_block
================================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.basic_block
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.bottleneck
==============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.bottleneck
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.conv
========================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.conv
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.reslayer
============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.reslayer
:members:

View File

@ -1,14 +0,0 @@
colossalai.nn.layer.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.layer.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_resnet.basic_block
colossalai.nn.layer.vanilla_resnet.bottleneck
colossalai.nn.layer.vanilla_resnet.conv
colossalai.nn.layer.vanilla_resnet.reslayer

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer.layers
=======================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer
================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_vision_transformer.layers

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.base\_loss
=============================
.. automodule:: colossalai.nn.loss.base_loss
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.cross\_entropy\_1d
=====================================
.. automodule:: colossalai.nn.loss.cross_entropy_1d
:members:

View File

@ -1,15 +1,13 @@
colossalai.nn.loss colossalai.nn.loss
================== ==================
.. automodule:: colossalai.nn.loss
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.loss.base_loss
colossalai.nn.loss.cross_entropy_1d
colossalai.nn.loss.cross_entropy_2d colossalai.nn.loss.cross_entropy_2d
colossalai.nn.loss.cross_entropy_2p5d colossalai.nn.loss.cross_entropy_2p5d
colossalai.nn.loss.cross_entropy_3d colossalai.nn.loss.cross_entropy_3d
.. automodule:: colossalai.nn.loss
:members:

View File

@ -1,10 +1,6 @@
colossalai.nn.lr\_scheduler colossalai.nn.lr\_scheduler
=========================== ===========================
.. automodule:: colossalai.nn.lr_scheduler
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -15,3 +11,7 @@ colossalai.nn.lr\_scheduler
colossalai.nn.lr_scheduler.onecycle colossalai.nn.lr_scheduler.onecycle
colossalai.nn.lr_scheduler.poly colossalai.nn.lr_scheduler.poly
colossalai.nn.lr_scheduler.torch colossalai.nn.lr_scheduler.torch
.. automodule:: colossalai.nn.lr_scheduler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.model.base\_model
===============================
.. automodule:: colossalai.nn.model.base_model
:members:

View File

@ -0,0 +1,5 @@
colossalai.nn.model.model\_from\_config
========================================
.. automodule:: colossalai.nn.model.model_from_config
:members:

View File

@ -1,17 +1,7 @@
colossalai.nn.model colossalai.nn.model
=================== ===================
.. automodule:: colossalai.nn.model
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.model.vanilla_resnet colossalai.nn.model.model_from_config
colossalai.nn.model.vision_transformer
.. toctree::
:maxdepth: 2
colossalai.nn.model.base_model

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vanilla\_resnet.resnet
==========================================
.. automodule:: colossalai.nn.model.vanilla_resnet.resnet
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.model.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vanilla_resnet.resnet

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vision\_transformer
=======================================
.. automodule:: colossalai.nn.model.vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vision_transformer.vision_transformer

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vision\_transformer.vision\_transformer
===========================================================
.. automodule:: colossalai.nn.model.vision_transformer.vision_transformer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.multi\_tensor\_apply.multi\_tensor\_apply
=======================================================
.. automodule:: colossalai.nn.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.multi\_tensor\_apply
==================================
.. automodule:: colossalai.nn.multi_tensor_apply
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.multi_tensor_apply.multi_tensor_apply

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.fp16\_optimizer
=======================================
.. automodule:: colossalai.nn.optimizer.fp16_optimizer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.loss\_scaler
====================================
.. automodule:: colossalai.nn.optimizer.loss_scaler
:members:

View File

@ -1,20 +1,15 @@
colossalai.nn.optimizer colossalai.nn.optimizer
======================= =======================
.. automodule:: colossalai.nn.optimizer
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.optimizer.fp16_optimizer
colossalai.nn.optimizer.fused_adam colossalai.nn.optimizer.fused_adam
colossalai.nn.optimizer.fused_lamb colossalai.nn.optimizer.fused_lamb
colossalai.nn.optimizer.fused_sgd colossalai.nn.optimizer.fused_sgd
colossalai.nn.optimizer.lamb colossalai.nn.optimizer.lamb
colossalai.nn.optimizer.lars colossalai.nn.optimizer.lars
colossalai.nn.optimizer.loss_scaler
colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
colossalai.nn.optimizer.zero_redundancy_optimizer_level_2 .. automodule:: colossalai.nn.optimizer
colossalai.nn.optimizer.zero_redundancy_optimizer_level_3 :members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_1
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_2
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_3
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
:members:

View File

@ -1,16 +1,15 @@
colossalai.nn colossalai.nn
============= =============
.. automodule:: colossalai.nn
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.nn.data
colossalai.nn.layer colossalai.nn.layer
colossalai.nn.loss colossalai.nn.loss
colossalai.nn.lr_scheduler colossalai.nn.lr_scheduler
colossalai.nn.model colossalai.nn.model
colossalai.nn.multi_tensor_apply
colossalai.nn.optimizer colossalai.nn.optimizer
.. automodule:: colossalai.nn
:members:

View File

@ -1,11 +1,11 @@
colossalai.registry colossalai.registry
=================== ===================
.. automodule:: colossalai.registry
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.registry.registry colossalai.registry.registry
.. automodule:: colossalai.registry
:members:

View File

@ -1,12 +1,18 @@
colossalai colossalai
========== ==========
.. automodule:: colossalai
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize
.. toctree::
:maxdepth: 2
colossalai.amp
colossalai.builder colossalai.builder
colossalai.communication colossalai.communication
colossalai.context colossalai.context
@ -16,11 +22,7 @@ colossalai
colossalai.registry colossalai.registry
colossalai.trainer colossalai.trainer
colossalai.utils colossalai.utils
colossalai.zero
.. automodule:: colossalai
.. toctree:: :members:
:maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize

View File

@ -1,9 +1,6 @@
colossalai.trainer colossalai.trainer
================== ==================
.. automodule:: colossalai.trainer
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -14,3 +11,7 @@ colossalai.trainer
:maxdepth: 2 :maxdepth: 2
colossalai.trainer.metric colossalai.trainer.metric
.. automodule:: colossalai.trainer
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.data\_sampler
=======================================
.. automodule:: colossalai.utils.data_sampler
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.gradient\_accumulation
=======================================
.. automodule:: colossalai.utils.gradient_accumulation
:members:

View File

@ -0,0 +1,8 @@
colossalai.utils.multi\_tensor\_apply
======================================
.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,10 +1,6 @@
colossalai.utils colossalai.utils
================ ================
.. automodule:: colossalai.utils
:members:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
@ -12,5 +8,12 @@ colossalai.utils
colossalai.utils.checkpointing colossalai.utils.checkpointing
colossalai.utils.common colossalai.utils.common
colossalai.utils.cuda colossalai.utils.cuda
colossalai.utils.data_sampler
colossalai.utils.gradient_accumulation
colossalai.utils.memory colossalai.utils.memory
colossalai.utils.multi_tensor_apply
colossalai.utils.timer colossalai.utils.timer
.. automodule:: colossalai.utils
:members:

View File

@ -0,0 +1,5 @@
colossalai.zero
================
.. automodule:: colossalai.zero
:members:

Some files were not shown because too many files have changed in this diff.