update examples and sphinx docs for the new api (#63)

pull/66/head
Frank Lee 2021-12-13 22:07:01 +08:00 committed by GitHub
parent 7d3711058f
commit 35813ed3c4
124 changed files with 1251 additions and 1462 deletions

View File

@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai
pip install colossalai
```
### Install From Source
### Install From Source (Recommended)
> We **recommend** that you install from source, as Colossal-AI is updated frequently in these early versions. The documentation is kept in line with the main branch of the repository. Feel free to raise an issue if you encounter any problems. :)
```shell
git clone git@github.com:hpcaitech/ColossalAI.git
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
@ -64,8 +66,8 @@ model = ...
# sampler by default
train_dataset = ...
train_dataloader = get_dataloader(dataset=dataset,
shuffle=True,
)
shuffle=True,
)
# build your

View File

@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
criterion: _Loss,
mode: AMP_TYPE,
amp_config: Config = None):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param mode: amp mode
:type mode: :class:`colossalai.amp.AMP_TYPE`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
assert isinstance(mode, AMP_TYPE), \
f'expected the argument mode be AMP_TYPE, but got {type(mode)}'
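For orientation, a minimal usage sketch of this helper under the new API; the model, optimizer and criterion below are placeholders, and the availability of `convert_to_amp` and `AMP_TYPE` at the `colossalai.amp` package level is assumed from the docstring types:
```python
import torch
from colossalai.amp import AMP_TYPE, convert_to_amp

# placeholder components purely for illustration
model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

# wrap the components; AMP_TYPE.TORCH is assumed to select the torch_amp backend
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion, mode=AMP_TYPE.TORCH)
```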

View File

@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for nvidia apex
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
optimizer = ApexAMPOptimizer(optimizer)
return model, optimizer

View File

@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32
class ApexAMPOptimizer(ColossalaiOptimizer):
''' A wrapper class for the APEX optimizer that implements apex-specific backward and clip_grad_norm
methods
'''
def backward(self, loss: Tensor):
"""
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
scaled_loss.backward()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""
:param model: your model object
:type model: torch.nn.Module
:param max_norm: the max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0:
clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)

View File

@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
optimizer: Optimizer,
amp_config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param amp_config: configuration for naive mode amp
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer)
:rtype: Tuple
"""
if is_no_pp_or_last_stage():
model = NaiveAMPModel(model, output_to_fp32=True)
else:

View File

@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
"""Float16 optimizer for fp16 and bf16 data types.
Arguments:
optimizer: base optimizer such as Adam or SGD
clip_grad: clip gradeints with this global L2 norm. Note
that clipping is ignored if clip_grad == 0
log_num_zeros_in_grad: return number of zeros in the gradients.
params_have_main_grad: flag indicating if parameters have
a `main_grad` field. If this is set, we are assuming
that the model parameters are store in the `main_grad`
field instead of the typical `grad` field. This happens
for the DDP cases where there is a contihuous buffer
holding the gradients. For example for bfloat16, we want
to do gradient accumulation and all-reduces in float32
and as a result we store those gradients in the main_grad.
Note that main grad is not necessarily in float32.
bf16: if true, the model is running in bfloat16.
grad_scaler: used for scaling gradients. Note that this can be
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
a constnat gradient scaler. Also for `bf16 = False`, we
always require a grad scaler.
:param optimizer: base optimizer such as Adam or SGD
:type optimizer: torch.optim.Optimizer
:param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
:type clip_grad: float
:param log_num_zeros_in_grad: return number of zeros in the gradients.
:type log_num_zeros_in_grad: bool
:param initial_scale: initial scale of gradient scaler
:type initial_scale: int
:param growth_factor: the growth rate of loss scale
:type growth_factor: int
:param backoff_factor: the decrease rate of loss scale
:type backoff_factor: float
:param hysteresis: delay shift in dynamic loss scaling
:type hysteresis: int
:param max_scale: maximum loss scale allowed
:type max_scale: int
"""
def __init__(self,

View File

@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer
class NaiveAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class for optimizer to cast all parameters to fp16
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
super().__init__(optim)
def backward(self, loss: Tensor):
"""backward with gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
loss = self.optim.scale_loss(loss)
loss.backward()
@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):
class NaiveAMPModel(nn.Module):
"""A wrapper class for model to cast the model into fp16 and
automatically cast the input and output
"""
def __init__(self,
model: nn.Module,

View File

@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
optimizer: Optimizer,
criterion: _Loss,
amp_config: Config):
"""A helper function to wrap training components with Torch AMP modules
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param criterion: your loss function object
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param amp_config: configuration for different amp modes
:type amp_config: :class:`colossalai.context.Config` or dict
:return: (model, optimizer, criterion)
:rtype: Tuple
"""
model = TorchAMPModel(model)
optimizer = TorchAMPOptimizer(optimizer, **amp_config)
criterion = TorchAMPLoss(criterion)

View File

@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32
class TorchAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class which integrate pytorch amp with an optimizer
:param optim: a normal optimizer like Adam or SGD
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
super().__init__(optim)
self.scaler = GradScaler(*args, **kwargs)
def backward(self, loss: Tensor):
"""backward with torch amp gradient scaler
:param loss: loss computed by a loss function
:type loss: torch.Tensor
"""
self.scaler.scale(loss).backward()
def step(self):
"""update the parameters of the model
"""
self.scaler.step(self.optim)
self.scaler.update()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""apply gradient clipping to the model parameters
:param model: your model object
:type model: torch.nn.Module
:param max_norm: max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0.0:
self.scaler.unscale_(self.optim)
clip_grad_norm_fp32(model.parameters(), max_norm)
class TorchAMPModel(nn.Module):
"""A wrapper class for a model object which executes forward with values automatically
cast to fp16
"""
def __init__(self, model: nn.Module) -> None:
super().__init__()
@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):
class TorchAMPLoss(nn.Module):
"""A wrapper class for a criterion object which computes the loss in mixed-precision context
:param loss: a loss function object
:type loss: torch.nn.modules.loss._Loss
"""
def __init__(self, loss: _Loss):
super().__init__()
self.loss = loss

View File

@ -16,8 +16,8 @@ def build_from_config(module, config: dict):
of the return object
:type config: dict
:raises AssertionError: Raises an AssertionError if `module` is not a class
:return: An object of :class:`module`
:rtype: :class:`module`
:return: An object of interest
:rtype: Object
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
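As a quick illustration of `build_from_config`, using a plain PyTorch class since any class plus keyword arguments will do; importing it from the `colossalai.builder` package level is an assumption based on the Sphinx docs in this commit:
```python
import torch.nn as nn
from colossalai.builder import build_from_config

# the config dict simply supplies the constructor's keyword arguments
layer = build_from_config(nn.Linear, dict(in_features=16, out_features=8))
assert isinstance(layer, nn.Linear)
```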
@ -62,8 +62,8 @@ def build_layer(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module`
:rtype: :class:`nn.Module`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, LAYERS)
@ -75,8 +75,8 @@ def build_loss(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.autograd.Function`
:rtype: :class:`torch.autograd.Function`
:return: An object of :class:`torch.nn.modules.loss._Loss`
:rtype: :class:`torch.nn.modules.loss._Loss`
"""
return build_from_registry(config, LOSSES)
@ -87,8 +87,8 @@ def build_model(config):
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`nn.Module`
:rtype: :class:`nn.Module`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, MODELS)
@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer):
:type model: :class:`nn.Module`
:param optimizer: An optimizer object containing parameters for the gradient handler
:type optimizer: :class:`torch.optim.Optimizer`
:return: An object of :class:`BaseGradientHandler`
:rtype: :class:`BaseGradientHandler`
:return: An object of :class:`colossalai.engine.BaseGradientHandler`
:rtype: :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
@ -151,8 +151,8 @@ def build_hooks(config, trainer):
:type config: dict or :class:`colossalai.context.Config`
:param trainer: A :class:`Trainer` object containing parameters for the hook
:type trainer: :class:`Trainer`
:return: An object of :class:`BaseHook`
:rtype: :class:`BaseHook`
:return: An object of :class:`colossalai.trainer.hooks.BaseHook`
:rtype: :class:`colossalai.trainer.hooks.BaseHook`
"""
config_ = config.copy()
config_['trainer'] = trainer
@ -182,8 +182,8 @@ def build_data_sampler(config, dataset):
:param dataset: An object of :class:`torch.utils.data.Dataset` containing information
used in the construction of the return object
:type dataset: :class:`torch.utils.data.Dataset`
:return: An object of :class:`colossalai.nn.data.sampler.BaseSampler`
:rtype: :class:`colossalai.nn.data.sampler.BaseSampler`
:return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
:rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
"""
config_ = config.copy()
config_['dataset'] = dataset
@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer):
:param optimizer: An optimizer object containing parameters for the learning rate
scheduler
:type optimizer: :class:`torch.optim.Optimizer`
:param total_steps: Number of total steps of the learning rate scheduler
:type total_steps: int
:param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
:type num_steps_per_epoch: int
:return: An object of :class:`torch.optim.lr_scheduler`
:rtype: :class:`torch.optim.lr_scheduler`
"""

View File

@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):
class PipelineModelInitializer():
"""An intializer to split the model into different stages for pipeline parallelism.
An example for the model config is shown below. The class VisionTransformerFromConfig should
inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
of layer configurations.
model_config = dict(
type='VisionTransformerFromConfig',
embedding_cfg=dict(...),
...
)
:param config: configuration of the model
:type config: dict
:param num_chunks: the number of chunks you want to have on the current stage. This value should be 1
in most cases unless you are using virtual pipeline parallelism.
:type num_chunks: int
:param verbose: whether to print the logs
:type verbose: bool
"""
def __init__(self, config, num_chunks, verbose=False):
self.num_chunks = num_chunks
self.ori_model = build_model(config)
@ -161,6 +183,13 @@ class PipelineModelInitializer():
self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])
def initialize(self, partition_method='parameter'):
"""Initialize the model object from the config passed
:param partition_method: this parameter determines how you want to split your model layers into stages;
it can be set to 'layer' or 'parameter'
:type partition_method: str
"""
# Some space for initializing communication groups
self._interval = None
self._partition_layers(method=partition_method)
@ -183,7 +212,7 @@ class PipelineModelInitializer():
# print_rank_0(param_counts)
self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
else:
assert method == 'layer', "Method should be a pre-set string"
raise ValueError("Method should be a pre-set string in [layer, parameter]")
# Display the partition
if gpc.get_global_rank() == 0 and self.verbose:
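A hedged sketch of driving this class, based only on the constructor and `initialize` signatures shown in this hunk; `model_config` stands for a dict in the format given in the class docstring, and the import path follows the `colossalai.builder.pipeline` page added to the Sphinx toctree:
```python
from colossalai.builder.pipeline import PipelineModelInitializer

# model_config: a dict such as the VisionTransformerFromConfig example in the docstring
initializer = PipelineModelInitializer(model_config, num_chunks=1, verbose=True)

# initialize() is assumed here to return the partitioned model for the current stage
model = initializer.initialize(partition_method='parameter')
```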

View File

@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int,
:param tensor: Tensor to be gathered
:param dim: The dimension concatenating in
:param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor
:type tensor: :class:`torch.Tensor`
:type dim: int
:type parallel_mode: ParallelMode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by all-gather
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
depth = gpc.get_world_size(parallel_mode)
temp = tensor.clone()
@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int,
:param tensor: Tensor to be reduced and scattered
:param dim: The dimension scattering in
:param parallel_mode: Parallel group mode used in this communication
:type tensor: Tensor
:type tensor: :class:`torch.Tensor`
:type dim: int
:type parallel_mode: ParallelMode
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor generated by reduce-scatter
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
depth = gpc.get_world_size(parallel_mode)
# temp = list(torch.chunk(tensor, depth, dim=dim))
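An illustrative call of `all_gather` (the `reduce_scatter` helper above follows the same pattern); the tensor shape and the choice of `ParallelMode.GLOBAL` are arbitrary, and a distributed environment initialized by `colossalai.launch` is assumed:
```python
import torch
from colossalai.communication import all_gather
from colossalai.context import ParallelMode

x = torch.ones(2, 4).cuda()
# concatenate the per-rank tensors of the chosen parallel group along dim 0
gathered = all_gather(x, dim=0, parallel_mode=ParallelMode.GLOBAL)
```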

View File

@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None):
:type input_tensor_shape: torch.Size
:type prev_rank: int, optional
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None
@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None):
:type output_grad_shape: torch.Size
:type next_rank: int, optional
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None
@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None):
:param output_tensor: Tensor to be sent
:param next_rank: The rank of the recipient of the tensor
:type output_tensor: Tensor
:type output_tensor: :class:`torch.Tensor`
:type next_rank: int, optional
"""
if not gpc.is_last_rank(ParallelMode.PIPELINE):
@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None):
:param input_tensor_grad: Tensor to be sent
:param prev_rank: The rank of the recipient of the tensor
:type input_tensor_grad: Tensor
:type input_tensor_grad: :class:`torch.Tensor`
:type prev_rank: int, optional
"""
if not gpc.is_first_rank(ParallelMode.PIPELINE):
@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor,
:param output_tensor: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received
:type output_tensor: Tensor
:type output_grad_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_last_rank(ParallelMode.PIPELINE):
output_tensor_grad = None
@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor
:type input_tensor_shape: torch.Size
:type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
if gpc.is_first_rank(ParallelMode.PIPELINE):
input_tensor = None
@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor,
:param output_tensor: Tensor to be sent
:param input_tensor_shape: The shape of the tensor to be received
:type output_tensor: Tensor
:type input_tensor_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:return: The input tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
input_tensor, _ = _communicate(tensor_send_next=output_tensor,
recv_prev=recv_prev,
@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad,
:param input_tensor_grad: Tensor to be sent
:param output_grad_shape: The shape of the tensor to be received
:type input_tensor_grad: Tensor
:type output_grad_shape: torch.Size
:type input_tensor_grad: :class:`torch.Tensor`
:type output_grad_shape: :class:`torch.Size`
:return: The grad of output tensor in forward step
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
_, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
recv_next=recv_next,
@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor,
:param input_tensor_grad: Tensor sent to the previous
:param input_tensor_shape: The shape of the tensor received from the previous
:param output_grad_shape: The shape of the tensor received from the next
:type output_tensor: Tensor
:type input_tensor_grad: Tensor
:type input_tensor_shape: torch.Size
:type output_grad_shape: torch.Size
:type output_tensor: :class:`torch.Tensor`
:type input_tensor_grad: :class:`torch.Tensor`
:type input_tensor_shape: :class:`torch.Size`
:type output_grad_shape: :class:`torch.Size`
:return: (the input tensor in forward step, the grad of output tensor in forward step)
:rtype: (Tensor, Tensor)
"""

View File

@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
:param tensor_send_next: Tensor sent to next member
:param parallel_mode: Parallel group mode used in this communication
:type tensor_send_next: Tensor
:type parallel_mode: ParallelMode
:type tensor_send_next: :class:`torch.Tensor`
:type parallel_mode: :class:`colossalai.context.ParallelMode`
:return: The tensor received from the previous
:rtype: Tensor
:rtype: :class:`torch.Tensor`
"""
buffer_shape = tensor_send_next.size()

View File

@ -433,6 +433,9 @@ class ParallelContext:
def set_device(self, device_ordinal: int = None):
"""Sets distributed processes to be bound to devices.
:param device_ordinal: the device id to be bound to
:type device_ordinal: int
"""
global_rank = self.get_global_rank()
if device_ordinal is None:
@ -445,6 +448,9 @@ class ParallelContext:
def set_seed(self, seed: int):
"""Sets seeds for all random libraries.
:param seed: seed for random states
:type seed: int
"""
random.seed(seed)
np.random.seed(seed)

View File

@ -57,38 +57,61 @@ class Engine:
@property
def model(self):
"""model attached to the engine"""
return self._model
@property
def optimizer(self):
"""optimizer attached to the engine"""
return self._optimizer
@property
def criterion(self):
"""criterion attached to the engine"""
return self._criterion
@property
def schedule(self):
return self._schedule
def zero_grad(self):
"""set the gradient of parameters to zero
"""
self.optimizer.zero_grad()
def step(self):
"""execute parameter update
"""
self._all_reduce_gradients()
self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
self.optimizer.step()
def backward(self, loss: Tensor):
"""Start backward propagation given the loss value computed by a loss function
:param loss: loss value computed by a loss function
:type loss: :class:`torch.Tensor`
"""
return self.optimizer.backward(loss)
def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
:param tensor: output tensor
:type tensor: :class:`torch.Tensor`
:param grad: gradient passed back to the output
:type grad: :class:`torch.Tensor`
"""
return self.optimizer.backward_by_grad(tensor, grad)
def calc_loss(self, *args, **kwargs):
"""compute the loss value
:return: the loss value
:rtype: :class:`torch.Tensor`
"""
return self.criterion(*args, **kwargs)
def __call__(self, *args, **kwargs):
"""run the forward step for the model
:return: output of the model
:rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
"""
return self.model(*args, **kwargs)
def _all_reduce_gradients(self):
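Putting these methods together, a hedged sketch of one training iteration with an engine returned by `colossalai.initialize`; the inputs and labels are assumed to be ready on the right device:
```python
import torch

def train_step(engine, inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """One simplified training iteration using the Engine API shown above."""
    engine.zero_grad()                        # clear gradients
    outputs = engine(inputs)                  # forward pass via Engine.__call__
    loss = engine.criterion(outputs, labels)  # criterion attached to the engine
    engine.backward(loss)                     # backward handled by the wrapped optimizer
    engine.step()                             # all-reduce, clip and update parameters
    return loss
```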

View File

@ -48,7 +48,7 @@ class BaseSchedule(ABC):
already in the same GPU as where the model's.
:return: (data, label)
:rtype: (Tensor, Tensor)
:rtype: (:class:`torch.Tensor`, :class:`torch.Tensor`)
"""
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')

View File

@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule):
:type data_iter: Iterator
:type forward_only: bool, optional
:type return_loss: bool, optional
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
"""
assert forward_only or return_loss, \
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."

View File

@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param return_tensors: a list of tensors to return
:type return_tensors: List[:class:`torch.Tensor`]
:return: output or the loss value of the current pipeline stage
:rtype: :class:`torch.Tensor`
"""
if input_tensor is None:
@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule):
output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
Returns the gradients with respect to the input tensor (None if first stage).
This is a helper function and can be ignored by users.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param input_tensor: input tensor for this pipeline stage
:type input_tensor: :class:`torch.Tensor`
:param output_tensor: output tensor for this pipeline stage
:type output_tensor: :class:`torch.Tensor`
:param output_tensor_grad: gradient of output tensor for this pipeline stage
:type output_tensor_grad: :class:`torch.Tensor`
:return: gradient of input tensor
:rtype: :class:`torch.Tensor`
"""
# Retain the grad on the input_tensor.
@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if the last stage, an empty tuple otherwise.
:param engine: your engine object
:type engine: colossalai.engine.Engine
:param data_iter: dataloader in the form of an iterator, obtained by calling iter(dataloader)
:type data_iter: Iterable
:param forward_only: whether to run the forward step only. Default is False. If True, no backward pass will be run.
:type forward_only: bool
:param return_loss: whether to return the loss value. Default is True.
:type return_loss: bool
:return: (output, label, loss)
:rtype: Tuple[:class:`torch.Tensor`]
"""
assert forward_only or return_loss, \

View File

@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict],
:param local_rank: rank for the process on the node and is used to set the default CUDA device,
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional
:param verbose: whether to print logs
:type verbose: bool
:raises Exception: raise exception when config type is wrong
'''
gpc.verbose = verbose
@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
launch(config=config,
@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
'''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
:param config: config file or config file path are both acceptable
:type config: Union[str, dict, Config]
:param host: the master address for distributed training
:type host: str
:param port: the master port for distributed training
:type port: str
:param backend: backend for torch.distributed
:type backend: str
:param verbose: whether to print logs
:type verbose: bool
'''
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
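A hedged sketch of invoking this wrapper from a script started with torchrun or torch.distributed.launch; the config path, host and port are placeholders, and the top-level re-export as `colossalai.launch_from_torch` is assumed:
```python
import colossalai

# run with e.g.:  torchrun --nproc_per_node=4 train.py
colossalai.launch_from_torch(config='./config.py',  # placeholder config path
                             host='localhost',      # master address (placeholder)
                             port=29500,            # master port (placeholder)
                             backend='nccl')
```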
@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
:param model: your model instance
:type model: a single or a list of ``torch.nn.Module`` objects
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer instance
:type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects
:type optimizer: :class:`torch.optim.optimizer.Optimizer`
:param criterion: your criterion instance
:type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects
:param train_dataloader: dataloaders for training data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
:param train_dataloader: dataloaders for testing data
:type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
:return: (engine, criterion, train_dataloader, test_dataloader)
:type criterion: :class:`torch.nn.modules.loss._Loss`
:param train_dataloader: dataloader for training data
:type train_dataloader: :class:`torch.utils.data.DataLoader`
:param test_dataloader: dataloader for testing data
:type test_dataloader: :class:`torch.utils.data.DataLoader`
:param lr_scheduler: your lr scheduler instance
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param verbose: whether to print logs
:type verbose: bool
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
:rtype: tuple
'''
# get logger
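Following the documented signature and return value, a hedged sketch of wiring everything through `colossalai.initialize`; the components are placeholders built beforehand, in the style of the README snippet:
```python
import torch
import colossalai

model = torch.nn.Linear(16, 4)                            # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # placeholder optimizer
criterion = torch.nn.CrossEntropyLoss()                   # placeholder criterion
train_dataloader = ...                                    # a torch.utils.data.DataLoader
test_dataloader = ...                                     # a torch.utils.data.DataLoader

engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader)
```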

View File

@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'):
"""Get logger instance based on name. The DistributedLogger will create singleton instances,
which means that only one logger instance is created per name.
:param name: name of the logger, name must be unique
:type name: str
:return: a distributed logger instance
:rtype: :class:`colossalai.logging.DistributedLogger`
"""
return DistributedLogger.get_instance(name=name)
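For example (the `ranks` keyword mirrors its use elsewhere in this diff to restrict output to rank 0):
```python
from colossalai.logging import get_dist_logger

logger = get_dist_logger()
logger.info('training started', ranks=[0])  # log only on global rank 0
```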

View File

@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
@LAYERS.register_module
class VanillaViTPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: size of a patch
:type patch_size: int
:param in_chans: input channels
:type in_chans: int
:param embed_dim: embedding dimension
:type embed_dim: int
:param norm_layer: layer norm class, defaults to None
:type norm_layer: Callable
:param flatten: whether to flatten the output
:type flatten: bool
:param drop: dropout rate
:type drop: float
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaViTMLP(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
:param in_features: input channels
:type in_features: int
:param hidden_features: channels of the output of the first dense layer
:type hidden_features: int
:param out_features: channels of the output of the second dense layer
:type out_features: int
:param act_layer: activation function
:type act_layer: Callable
:param drop: dropout rate
:type drop: float
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
:param drop_prob: probability for dropout
:type drop_prob: float
:param training: whether it is training mode
:type training: bool
"""
if drop_prob == 0. or not training:
return x
@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
@LAYERS.register_module
class VanillaViTDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
:param drop_prob: probability for dropout
:type drop_prob: float
"""
def __init__(self, drop_prob=0.):
@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
:param dim: dimension of input tensor
:type dim: int
:param num_heads: number of attention heads, defaults to 8
:param num_heads: number of attention heads
:type num_heads: int, optional
:param qkv_bias: enable bias for qkv if True, defaults to False
:type qkv_bias: bool, optional
@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
:type proj_drop: float, optional
"""
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads

View File

@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
added functionality to handle model parallel parameters. Note that
the gradients are modified in place.
Arguments:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
:param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: max norm of the gradients
:type max_norm: float or int
:param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
Returns:
Total norm of the parameters (viewed as a single vector).
:return: Total norm of the parameters (viewed as a single vector).
:rtype: float
"""
if isinstance(parameters, torch.Tensor):

View File

@ -123,12 +123,23 @@ def get_dataloader(dataset,
stage and label on the last stage
:param dataset: a :class:utils.data.dataset dataset
:param shuffle: whether to shuffle the dataset
:param seed: random worker seed, defaults to 1024
:type seed: int, optional
:param add_sampler_if_possible: [description], defaults to False
:type add_sampler_if_possible: bool, optional
:return: a :class:utils.data.dataset dataloader
:rtype: torch.utils.data.dataset
:param add_sampler: add a DistributedDataParallelSampler to the dataloader
:param drop_last: drop the last incomplete batch of data
:param pin_memory: whether to pin memory address in CPU memory
:param num_workers: number of worker threads for this dataloader
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: an object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
'''
_kwargs = kwargs.copy()
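A hedged sketch of a fuller call with the documented keyword arguments; `train_dataset` is a placeholder, and forwarding `batch_size` through `**kwargs` to the underlying `torch.utils.data.DataLoader` is an assumption:
```python
from colossalai.utils import get_dataloader

train_dataloader = get_dataloader(dataset=train_dataset,  # a torch.utils.data.Dataset built beforehand
                                  shuffle=True,
                                  add_sampler=True,        # attach a distributed sampler
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=4,
                                  batch_size=64)           # forwarded to DataLoader (assumed)
```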

View File

@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
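The wrappers created above can also be used directly; a hedged sketch, assuming they are importable from `colossalai.utils.gradient_accumulation` as the new Sphinx page suggests, with `model`, `optimizer` and `train_dataloader` standing for objects built beforehand (in practice you would normally call the `accumulate_gradient` helper, whose full return value is cut off in this hunk):
```python
from colossalai.utils.gradient_accumulation import GradAccumOptimizer, GradAccumDataloader

# accumulate gradients over 4 steps: the optimizer only applies updates on steps 4, 8, ...
optimizer = GradAccumOptimizer(optimizer, accumulate_size=4, model=model)
train_dataloader = GradAccumDataloader(train_dataloader, accumulate_size=4)
```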

View File

@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param model: your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
super().__init__(optim)
@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader():
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumualate_size: int
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
self.dataloader = dataloader
@ -99,6 +123,15 @@ class GradAccumDataloader():
class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
self.lr_scheduler = lr_scheduler
@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler():
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param grad_handler: your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
"""
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
assert isinstance(grad_handler, BaseGradientHandler), \

View File

@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
:param message: a prefix message to add in the log
:type message: str
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`
:param report_cpu: whether to report CPU memory
:type report_cpu: bool
:raises EnvironmentError: raise error if no distributed environment has been initialized
'''
if not gpc.is_initialized(ParallelMode.GLOBAL):
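For instance (the message string is arbitrary and a launched distributed environment is assumed):
```python
from colossalai.logging import get_dist_logger
from colossalai.utils import report_memory_usage

logger = get_dist_logger()
report_memory_usage('after the forward pass', logger=logger, report_cpu=True)
```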

View File

@ -2,6 +2,13 @@
class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
:param chunk_size: size of a chunk
:type chunk_size: int
"""
available = False
warned = False

View File

@ -74,6 +74,9 @@ class Timer:
class MultiTimer:
'''An object that contains multiple timers
:param on: whether the timer is enabled. Default is True
:type on: bool
'''
def __init__(self, on: bool = True):

View File

@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
optimizer: Optimizer,
level: int,
zero_config):
"""
A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param level: optimizer level, can be 2 or 3
:type level: int
:param zero_config: configuration for zero
:type zero_config: dict
:return: (model, optimizer)
:rtype: Tuple
"""
assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
if level == 2:
if is_no_pp_or_last_stage():
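A hedged sketch of the call following the documented signature; `model` and `optimizer` stand for objects built beforehand, and the empty `zero_config` is a placeholder for the settings in your config file:
```python
from colossalai.zero import convert_to_zero

# wrap the components with ZeRO level-2 optimizer state partitioning
model, optimizer = convert_to_zero(model=model,
                                   optimizer=optimizer,
                                   level=2,
                                   zero_config=dict())  # fill with ZeRO settings from your config
```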

View File

@ -1,76 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
IMG_SIZE = 224
BATCH_SIZE = 256
NUM_EPOCHS = 100
model = dict(
type='VanillaResNet',
block_type='ResNetBottleneck',
layers=[3, 4, 6, 3],
num_cls=10
)
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=os.environ['DATA'],
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
parallelization = dict(
pipeline=1,
tensor=dict(size=1, mode=None),
)
optimizer = dict(
type='Adam',
lr=0.01
)
loss = dict(
type='CrossEntropyLoss'
)
from colossalai.engine import AMP_TYPE
fp16 = dict(
mode=AMP_TYPE.APEX,
opt_level='O2',
)

View File

@ -1,22 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
NUM_EPOCH = int
model = dict()
train_data = dict()
test_data = dict()
optimizer = dict()
loss = dict()
fp16 = dict()
zero = dict()
gradient_handler = []
parallel = dict()
hooks = []
cudnn_benchmark = True
cudnn_deterministic = False
logging = dict()

View File

@ -1,165 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 2
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
drop_last=True,
pin_memory=True,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='Resize', size=IMG_SIZE),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
)
)
optimizer = dict(
type='Adam',
lr=0.001,
weight_decay=0
)
loss = dict(
type='CrossEntropyLoss2D',
)
model = dict(
type='VisionTransformerFromConfig',
tensor_splitting_cfg=dict(
type='ViTInputSplitter2D',
),
embedding_cfg=dict(
type='ViTPatchEmbedding2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
),
token_fusion_cfg=dict(
type='ViTTokenFuser2D',
img_size=IMG_SIZE,
patch_size=PATCH_SIZE,
embed_dim=DIM,
drop_rate=0.1
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
block_cfg=dict(
type='ViTBlock',
attention_cfg=dict(
type='ViTSelfAttention2D',
hidden_size=DIM,
num_attention_heads=NUM_ATTENTION_HEADS,
attention_dropout_prob=0.,
hidden_dropout_prob=0.1,
checkpoint=True
),
droppath_cfg=dict(
type='VanillaViTDropPath',
),
mlp_cfg=dict(
type='ViTMLP2D',
in_features=DIM,
dropout_prob=0.1,
mlp_ratio=4,
checkpoint=True
),
norm_cfg=dict(
type='LayerNorm2D',
normalized_shape=DIM,
eps=1e-6,
),
),
head_cfg=dict(
type='ViTHead2D',
hidden_size=DIM,
num_classes=NUM_CLASSES,
),
embed_dim=DIM,
depth=DEPTH,
drop_path_rate=0.,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='Accuracy2DHook'),
dict(type='LossHook'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='TensorboardHook', log_dir='./tb_logs'),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
# for fp16 training
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
# mode=AMP_TYPE.PARALLEL,
# initial_scale=2 ** 8
# )
# only needed when pipeline parallel is used
# schedule = dict(
# num_microbatches=8
# )
logging = dict(
root_path='./logs'
)

View File

@ -1,111 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE
try:
import model_zoo
except:
print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')
BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
transform_pipeline=[
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
shuffle=True,
)
)
test_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
train=False,
transform_pipeline=[
dict(type='ToTensor'),
dict(type='Normalize',
mean=[0.4914, 0.4822, 0.4465],
std=[0.2023, 0.1994, 0.2010]
),
]
),
dataloader=dict(
batch_size=BATCH_SIZE,
pin_memory=True,
num_workers=2,
)
)
optimizer = dict(
type='Adam',
lr=0.001
)
loss = dict(
type='CrossEntropyLoss3D',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)
model = dict(
type='vit_tiny_3d_patch4_32',
drop_rate=0.1,
)
hooks = [
dict(type='LogMetricByEpochHook'),
dict(type='LogTimingByEpochHook'),
dict(type='LogMemoryByEpochHook'),
dict(
type='Accuracy3DHook',
input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
),
dict(type='LossHook'),
dict(type='TensorboardHook', log_dir='./tfb_logs'),
dict(
type='LRSchedulerHook',
by_epoch=True,
lr_scheduler_cfg=dict(
type='LinearWarmupLR',
warmup_steps=5
)
),
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=8, mode='3d'),
)
fp16 = dict(
mode=AMP_TYPE.PARALLEL,
initial_scale=2 ** 8
)
logging = dict(
root_path='./logs'
)

View File

@ -77,10 +77,10 @@ fp16 = dict(
)
```
## Tensor Parallel AMP
## Naive AMP
We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism.
and pipeline parallelism. This AMP mode will cast all operations into fp16.
The following code block shows a config file for this mode.
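The referenced code block is truncated in this diff; a minimal sketch of what such a config might look like under the new `colossalai.amp` API (the `NAIVE` member name and the scaler field are assumptions based on the `naive_amp` module and `FP16Optimizer` docstring added in this commit):
```python
from colossalai.amp import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.NAIVE,   # assumed member name, matching the naive_amp module added here
    initial_scale=2 ** 8,  # illustrative gradient-scaler setting from FP16Optimizer's parameters
)
```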

View File

@ -0,0 +1,5 @@
colossalai.amp.apex\_amp
==========================
.. automodule:: colossalai.amp.apex_amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.naive\_amp
==========================
.. automodule:: colossalai.amp.naive_amp
:members:

View File

@ -0,0 +1,13 @@
colossalai.amp
==================
.. toctree::
:maxdepth: 2
colossalai.amp.torch_amp
colossalai.amp.apex_amp
colossalai.amp.naive_amp
.. automodule:: colossalai.amp
:members:

View File

@ -0,0 +1,5 @@
colossalai.amp.torch\_amp
==========================
.. automodule:: colossalai.amp.torch_amp
:members:

View File

@ -1,12 +1,12 @@
colossalai.builder
==================
.. automodule:: colossalai.builder
:members:
.. toctree::
:maxdepth: 2
colossalai.builder.builder
colossalai.builder.pipeline
.. automodule:: colossalai.builder
:members:

View File

@ -1,5 +0,0 @@
colossalai.checkpointing
========================
.. automodule:: colossalai.checkpointing
:members:

View File

@ -1,10 +1,6 @@
colossalai.communication
========================
.. automodule:: colossalai.communication
:members:
.. toctree::
:maxdepth: 2
@ -12,3 +8,7 @@ colossalai.communication
colossalai.communication.p2p
colossalai.communication.ring
colossalai.communication.utils
.. automodule:: colossalai.communication
:members:

View File

@ -1,11 +1,11 @@
colossalai.context.random
=========================
.. automodule:: colossalai.context.random
:members:
.. toctree::
:maxdepth: 2
colossalai.context.random.seed_manager
.. automodule:: colossalai.context.random
:members:

View File

@ -1,9 +1,6 @@
colossalai.context
==================
.. automodule:: colossalai.context
:members:
.. toctree::
:maxdepth: 2
@ -17,3 +14,7 @@ colossalai.context
colossalai.context.config
colossalai.context.parallel_context
colossalai.context.parallel_mode
.. automodule:: colossalai.context
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.amp\_type
===============================
.. automodule:: colossalai.engine.amp.amp_type
:members:

View File

@ -1,5 +0,0 @@
colossalai.engine.amp.grad\_scaler
==================================
.. automodule:: colossalai.engine.amp.grad_scaler
:members:

View File

@ -1,12 +0,0 @@
colossalai.engine.amp
=====================
.. automodule:: colossalai.engine.amp
:members:
.. toctree::
:maxdepth: 2
colossalai.engine.amp.amp_type
colossalai.engine.amp.grad_scaler

View File

@ -1,12 +1,12 @@
colossalai.engine
=================
.. automodule:: colossalai.engine
:members:
.. toctree::
:maxdepth: 2
colossalai.engine.amp
colossalai.engine.gradient_handler
colossalai.engine.schedule
.. automodule:: colossalai.engine
:members:

View File

@ -1,11 +1,11 @@
colossalai.logging
==================
.. automodule:: colossalai.logging
:members:
.. toctree::
:maxdepth: 2
colossalai.logging.logging
.. automodule:: colossalai.logging
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.base\_dataset
================================
.. automodule:: colossalai.nn.data.base_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.caltech101\_dataset
======================================
.. automodule:: colossalai.nn.data.caltech101_dataset
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.cifar10\_dataset
===================================
.. automodule:: colossalai.nn.data.cifar10_dataset
:members:

View File

@ -1,18 +0,0 @@
colossalai.nn.data
==================
.. automodule:: colossalai.nn.data
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler
.. toctree::
:maxdepth: 2
colossalai.nn.data.base_dataset
colossalai.nn.data.caltech101_dataset
colossalai.nn.data.cifar10_dataset

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.base\_sampler
========================================
.. automodule:: colossalai.nn.data.sampler.base_sampler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.data.sampler.data\_parallel\_sampler
==================================================
.. automodule:: colossalai.nn.data.sampler.data_parallel_sampler
:members:

View File

@ -1,12 +0,0 @@
colossalai.nn.data.sampler
==========================
.. automodule:: colossalai.nn.data.sampler
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data.sampler.base_sampler
colossalai.nn.data.sampler.data_parallel_sampler

View File

@ -0,0 +1,5 @@
colossalai.nn.layer.non\_parallel\_layers
==========================================
.. automodule:: colossalai.nn.layer.non_parallel_layers
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_1d
================================
.. automodule:: colossalai.nn.layer.parallel_1d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_1d.layers
.. automodule:: colossalai.nn.layer.parallel_1d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2d
================================
.. automodule:: colossalai.nn.layer.parallel_2d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_2d.layers
.. automodule:: colossalai.nn.layer.parallel_2d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2p5d
==================================
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_2p5d.layers
.. automodule:: colossalai.nn.layer.parallel_2p5d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_3d
================================
.. automodule:: colossalai.nn.layer.parallel_3d
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_3d.layers
.. automodule:: colossalai.nn.layer.parallel_3d
:members:

View File

@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_sequence
======================================
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_sequence.layers
.. automodule:: colossalai.nn.layer.parallel_sequence
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer.layers
========================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer
=================================================
.. automodule:: colossalai.nn.layer.parallel_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.parallel_vision_transformer.layers

View File

@ -1,9 +1,6 @@
colossalai.nn.layer
===================
.. automodule:: colossalai.nn.layer
:members:
.. toctree::
:maxdepth: 2
@ -12,13 +9,10 @@ colossalai.nn.layer
colossalai.nn.layer.parallel_2p5d
colossalai.nn.layer.parallel_3d
colossalai.nn.layer.parallel_sequence
colossalai.nn.layer.parallel_vision_transformer
colossalai.nn.layer.vanilla_resnet
colossalai.nn.layer.vanilla_vision_transformer
colossalai.nn.layer.non_parallel_layers
colossalai.nn.layer.wrapper
.. toctree::
:maxdepth: 2
colossalai.nn.layer.base_layer
.. automodule:: colossalai.nn.layer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.basic\_block
================================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.basic_block
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.bottleneck
==============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.bottleneck
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.conv
========================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.conv
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.reslayer
============================================
.. automodule:: colossalai.nn.layer.vanilla_resnet.reslayer
:members:

View File

@ -1,14 +0,0 @@
colossalai.nn.layer.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.layer.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_resnet.basic_block
colossalai.nn.layer.vanilla_resnet.bottleneck
colossalai.nn.layer.vanilla_resnet.conv
colossalai.nn.layer.vanilla_resnet.reslayer

View File

@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer.layers
=======================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer.layers
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer
================================================
.. automodule:: colossalai.nn.layer.vanilla_vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.layer.vanilla_vision_transformer.layers

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.base\_loss
=============================
.. automodule:: colossalai.nn.loss.base_loss
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.loss.cross\_entropy\_1d
=====================================
.. automodule:: colossalai.nn.loss.cross_entropy_1d
:members:

View File

@ -1,15 +1,13 @@
colossalai.nn.loss
==================
.. automodule:: colossalai.nn.loss
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.loss.base_loss
colossalai.nn.loss.cross_entropy_1d
colossalai.nn.loss.cross_entropy_2d
colossalai.nn.loss.cross_entropy_2p5d
colossalai.nn.loss.cross_entropy_3d
.. automodule:: colossalai.nn.loss
:members:

View File

@ -1,10 +1,6 @@
colossalai.nn.lr\_scheduler
===========================
.. automodule:: colossalai.nn.lr_scheduler
:members:
.. toctree::
:maxdepth: 2
@ -15,3 +11,7 @@ colossalai.nn.lr\_scheduler
colossalai.nn.lr_scheduler.onecycle
colossalai.nn.lr_scheduler.poly
colossalai.nn.lr_scheduler.torch
.. automodule:: colossalai.nn.lr_scheduler
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.model.base\_model
===============================
.. automodule:: colossalai.nn.model.base_model
:members:

View File

@ -0,0 +1,5 @@
colossalai.nn.model.model\_from\_config
========================================
.. automodule:: colossalai.nn.model.model_from_config
:members:

View File

@ -1,17 +1,7 @@
colossalai.nn.model
===================
.. automodule:: colossalai.nn.model
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vanilla_resnet
colossalai.nn.model.vision_transformer
.. toctree::
:maxdepth: 2
colossalai.nn.model.base_model
colossalai.nn.model.model_from_config

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vanilla\_resnet.resnet
==========================================
.. automodule:: colossalai.nn.model.vanilla_resnet.resnet
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vanilla\_resnet
===================================
.. automodule:: colossalai.nn.model.vanilla_resnet
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vanilla_resnet.resnet

View File

@ -1,11 +0,0 @@
colossalai.nn.model.vision\_transformer
=======================================
.. automodule:: colossalai.nn.model.vision_transformer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.model.vision_transformer.vision_transformer

View File

@ -1,5 +0,0 @@
colossalai.nn.model.vision\_transformer.vision\_transformer
===========================================================
.. automodule:: colossalai.nn.model.vision_transformer.vision_transformer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.multi\_tensor\_apply.multi\_tensor\_apply
=======================================================
.. automodule:: colossalai.nn.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,11 +0,0 @@
colossalai.nn.multi\_tensor\_apply
==================================
.. automodule:: colossalai.nn.multi_tensor_apply
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.multi_tensor_apply.multi_tensor_apply

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.fp16\_optimizer
=======================================
.. automodule:: colossalai.nn.optimizer.fp16_optimizer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.loss\_scaler
====================================
.. automodule:: colossalai.nn.optimizer.loss_scaler
:members:

View File

@ -1,20 +1,15 @@
colossalai.nn.optimizer
=======================
.. automodule:: colossalai.nn.optimizer
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.optimizer.fp16_optimizer
colossalai.nn.optimizer.fused_adam
colossalai.nn.optimizer.fused_lamb
colossalai.nn.optimizer.fused_sgd
colossalai.nn.optimizer.lamb
colossalai.nn.optimizer.lars
colossalai.nn.optimizer.loss_scaler
colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
.. automodule:: colossalai.nn.optimizer
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_1
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_2
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
:members:

View File

@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_3
=============================================================
.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
:members:

View File

@ -1,16 +1,15 @@
colossalai.nn
=============
.. automodule:: colossalai.nn
:members:
.. toctree::
:maxdepth: 2
colossalai.nn.data
colossalai.nn.layer
colossalai.nn.loss
colossalai.nn.lr_scheduler
colossalai.nn.model
colossalai.nn.multi_tensor_apply
colossalai.nn.optimizer
.. automodule:: colossalai.nn
:members:

View File

@ -1,11 +1,11 @@
colossalai.registry
===================
.. automodule:: colossalai.registry
:members:
.. toctree::
:maxdepth: 2
colossalai.registry.registry
.. automodule:: colossalai.registry
:members:

View File

@ -1,12 +1,18 @@
colossalai
==========
.. automodule:: colossalai
:members:
.. toctree::
:maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize
.. toctree::
:maxdepth: 2
colossalai.amp
colossalai.builder
colossalai.communication
colossalai.context
@ -16,11 +22,7 @@ colossalai
colossalai.registry
colossalai.trainer
colossalai.utils
colossalai.zero
.. toctree::
:maxdepth: 2
colossalai.constants
colossalai.core
colossalai.initialize
.. automodule:: colossalai
:members:

View File

@ -1,9 +1,6 @@
colossalai.trainer
==================
.. automodule:: colossalai.trainer
:members:
.. toctree::
:maxdepth: 2
@ -14,3 +11,7 @@ colossalai.trainer
:maxdepth: 2
colossalai.trainer.metric
.. automodule:: colossalai.trainer
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.data\_sampler
=======================================
.. automodule:: colossalai.utils.data_sampler
:members:

View File

@ -0,0 +1,5 @@
colossalai.utils.gradient\_accumulation
=======================================
.. automodule:: colossalai.utils.gradient_accumulation
:members:

View File

@ -0,0 +1,8 @@
colossalai.utils.multi\_tensor\_apply
======================================
.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply
:members:

View File

@ -1,10 +1,6 @@
colossalai.utils
================
.. automodule:: colossalai.utils
:members:
.. toctree::
:maxdepth: 2
@ -12,5 +8,12 @@ colossalai.utils
colossalai.utils.checkpointing
colossalai.utils.common
colossalai.utils.cuda
colossalai.utils.data_sampler
colossalai.utils.gradient_accumulation
colossalai.utils.memory
colossalai.utils.multi_tensor_apply
colossalai.utils.timer
.. automodule:: colossalai.utils
:members:

View File

@ -0,0 +1,5 @@
colossalai.zero
================
.. automodule:: colossalai.zero
:members:

Some files were not shown because too many files have changed in this diff.