mirror of https://github.com/hpcaitech/ColossalAI
update examples and sphinx docs for the new api (#63)
parent 7d3711058f
commit 35813ed3c4

README.md | 10
@@ -14,10 +14,12 @@ Blog: [Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Trai
pip install colossalai
```

### Install From Source
### Install From Source (Recommended)

> We **recommend** that you install from source, as Colossal-AI is updated frequently in these early versions. The documentation is kept in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)

```shell
git clone git@github.com:hpcaitech/ColossalAI.git
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# install dependency
pip install -r requirements/requirements.txt
@@ -64,8 +66,8 @@ model = ...
# sampler by default
train_dataset = ...
train_dataloader = get_dataloader(dataset=dataset,
                                  shuffle=True,
                                  )
                                  shuffle=True,
                                  )


# build your
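For reference, here is a self-contained sketch of the snippet above; the torchvision dataset, the batch size, and the `colossalai.utils` import path are illustrative assumptions rather than part of this diff.

```python
from torchvision import datasets, transforms
from colossalai.utils import get_dataloader  # assumed public import path

# toy dataset standing in for the elided `train_dataset = ...`
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True,
                                 transform=transforms.ToTensor())

# a distributed sampler is attached by default when running in parallel
train_dataloader = get_dataloader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=256)
```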
@@ -16,6 +16,22 @@ def convert_to_amp(model: nn.Module,
                   criterion: _Loss,
                   mode: AMP_TYPE,
                   amp_config: Config = None):
    """A helper function to wrap training components with AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param mode: amp mode
    :type mode: :class:`colossalai.amp.AMP_TYPE`
    :param amp_config: configuration for different amp modes
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    assert isinstance(mode, AMP_TYPE), \
        f'expected the argument mode to be AMP_TYPE, but got {type(mode)}'
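A minimal sketch of how this helper might be called; the `AMP_TYPE.TORCH` member and the toy model are assumptions for illustration, not taken from this diff.

```python
import torch
import torch.nn as nn
from colossalai.amp import AMP_TYPE, convert_to_amp  # assumed import path

model = nn.Linear(128, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# wrap the three components; amp_config (if given) is forwarded to the chosen backend
model, optimizer, criterion = convert_to_amp(model, optimizer, criterion,
                                             mode=AMP_TYPE.TORCH)
```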
@@ -7,6 +7,18 @@ import apex.amp as apex_amp
def convert_to_apex_amp(model: nn.Module,
                        optimizer: Optimizer,
                        amp_config):
    """A helper function to wrap training components with Apex AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for nvidia apex
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    model, optimizer = apex_amp.initialize(model, optimizer, **amp_config)
    optimizer = ApexAMPOptimizer(optimizer)
    return model, optimizer
@@ -13,11 +13,24 @@ from colossalai.utils import clip_grad_norm_fp32


class ApexAMPOptimizer(ColossalaiOptimizer):
    '''A wrapper class for the Apex optimizer that implements apex-specific backward and clip_grad_norm
    methods
    '''

    def backward(self, loss: Tensor):
        """
        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        with apex_amp.scale_loss(loss, self.optim) as scaled_loss:
            scaled_loss.backward()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """
        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: the max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0:
            clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm)
@@ -8,6 +8,18 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
def convert_to_naive_amp(model: nn.Module,
                         optimizer: Optimizer,
                         amp_config):
    """A helper function to wrap training components with naive AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param amp_config: configuration for naive mode amp
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    if is_no_pp_or_last_stage():
        model = NaiveAMPModel(model, output_to_fp32=True)
    else:
@@ -146,26 +146,22 @@ class DynamicGradScaler:
class FP16Optimizer(Optimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Arguments:
        optimizer: base optimizer such as Adam or SGD
        clip_grad: clip gradients with this global L2 norm. Note
            that clipping is ignored if clip_grad == 0
        log_num_zeros_in_grad: return number of zeros in the gradients.
        params_have_main_grad: flag indicating if parameters have
            a `main_grad` field. If this is set, we are assuming
            that the model parameters are stored in the `main_grad`
            field instead of the typical `grad` field. This happens
            for the DDP cases where there is a contiguous buffer
            holding the gradients. For example for bfloat16, we want
            to do gradient accumulation and all-reduces in float32
            and as a result we store those gradients in the main_grad.
            Note that main grad is not necessarily in float32.
        bf16: if true, the model is running in bfloat16.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also for `bf16 = False`, we
            always require a grad scaler.
    :param optimizer: base optimizer such as Adam or SGD
    :type optimizer: torch.optim.Optimizer
    :param clip_grad: clip gradients with this global L2 norm. Note that clipping is ignored if clip_grad == 0
    :type clip_grad: float
    :param log_num_zeros_in_grad: return number of zeros in the gradients.
    :type log_num_zeros_in_grad: bool
    :param initial_scale: initial scale of gradient scaler
    :type initial_scale: int
    :param growth_factor: the growth rate of loss scale
    :type growth_factor: int
    :param backoff_factor: the decrease rate of loss scale
    :type backoff_factor: float
    :param hysteresis: delay shift in dynamic loss scaling
    :type hysteresis: int
    :param max_scale: maximum loss scale allowed
    :type max_scale: int
    """

    def __init__(self,
@@ -13,12 +13,21 @@ from ._fp16_optimizer import FP16Optimizer


class NaiveAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class for an optimizer to cast all parameters to fp16

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        optim = FP16Optimizer(optimizer=optim, *args, **kwargs)
        super().__init__(optim)

    def backward(self, loss: Tensor):
        """backward with gradient scaler
        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        loss = self.optim.scale_loss(loss)
        loss.backward()

@@ -30,6 +39,9 @@ class NaiveAMPOptimizer(ColossalaiOptimizer):


class NaiveAMPModel(nn.Module):
    """A wrapper class that casts a model to fp16 and
    automatically casts its input and output
    """

    def __init__(self,
                 model: nn.Module,
@@ -9,6 +9,20 @@ def convert_to_torch_amp(model: nn.Module,
                         optimizer: Optimizer,
                         criterion: _Loss,
                         amp_config: Config):
    """A helper function to wrap training components with Torch AMP modules

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param criterion: your loss function object
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param amp_config: configuration for Torch AMP
    :type amp_config: :class:`colossalai.context.Config` or dict

    :return: (model, optimizer, criterion)
    :rtype: Tuple
    """
    model = TorchAMPModel(model)
    optimizer = TorchAMPOptimizer(optimizer, **amp_config)
    criterion = TorchAMPLoss(criterion)
@@ -14,25 +14,45 @@ from colossalai.utils import clip_grad_norm_fp32


class TorchAMPOptimizer(ColossalaiOptimizer):
    """A wrapper class which integrates PyTorch AMP with an optimizer

    :param optim: a normal optimizer like Adam or SGD
    :type optim: torch.optim.Optimizer
    """

    def __init__(self, optim: Optimizer, *args, **kwargs):
        super().__init__(optim)
        self.scaler = GradScaler(*args, **kwargs)

    def backward(self, loss: Tensor):
        """backward with torch amp gradient scaler
        :param loss: loss computed by a loss function
        :type loss: torch.Tensor
        """
        self.scaler.scale(loss).backward()

    def step(self):
        """update the parameters of the model
        """
        self.scaler.step(self.optim)
        self.scaler.update()

    def clip_grad_norm(self, model: nn.Module, max_norm: float):
        """apply gradient clipping to the model parameters
        :param model: your model object
        :type model: torch.nn.Module
        :param max_norm: max norm value for gradient clipping
        :type max_norm: float
        """
        if max_norm > 0.0:
            self.scaler.unscale_(self.optim)
            clip_grad_norm_fp32(model.parameters(), max_norm)


class TorchAMPModel(nn.Module):
    """A wrapper class for a model object which executes the forward pass with values automatically
    cast to fp16
    """

    def __init__(self, model: nn.Module) -> None:
        super().__init__()

@@ -44,7 +64,10 @@ class TorchAMPModel(nn.Module):


class TorchAMPLoss(nn.Module):
    """A wrapper class for a criterion object which computes the loss in a mixed-precision context
    :param loss: a loss function object
    :type loss: torch.nn.modules.loss._Loss
    """
    def __init__(self, loss: _Loss):
        super().__init__()
        self.loss = loss
@ -16,8 +16,8 @@ def build_from_config(module, config: dict):
|
|||
of the return object
|
||||
:type config: dict
|
||||
:raises AssertionError: Raises an AssertionError if `module` is not a class
|
||||
:return: An object of :class:`module`
|
||||
:rtype: :class:`module`
|
||||
:return: An object of interest
|
||||
:rtype: Object
|
||||
"""
|
||||
assert inspect.isclass(module), 'module must be a class'
|
||||
return module(**config)
|
||||
|
@ -62,8 +62,8 @@ def build_layer(config):
|
|||
:param config: A python dict or a :class:`colossalai.context.Config` object
|
||||
containing information used in the construction of the return object
|
||||
:type config: dict or :class:`colossalai.context.Config`
|
||||
:return: An object of :class:`nn.Module`
|
||||
:rtype: :class:`nn.Module`
|
||||
:return: An object of :class:`torch.nn.Module`
|
||||
:rtype: :class:`torch.nn.Module`
|
||||
"""
|
||||
return build_from_registry(config, LAYERS)
|
||||
|
||||
|
@ -75,8 +75,8 @@ def build_loss(config):
|
|||
:param config: A python dict or a :class:`colossalai.context.Config` object
|
||||
containing information used in the construction of the return object
|
||||
:type config: dict or :class:`colossalai.context.Config`
|
||||
:return: An object of :class:`torch.autograd.Function`
|
||||
:rtype: :class:`torch.autograd.Function`
|
||||
:return: An object of :class:`torch.nn.modules.loss._Loss`
|
||||
:rtype: :class:`torch.nn.modules.loss._Loss`
|
||||
"""
|
||||
return build_from_registry(config, LOSSES)
|
||||
|
||||
|
@ -87,8 +87,8 @@ def build_model(config):
|
|||
:param config: A python dict or a :class:`colossalai.context.Config` object
|
||||
containing information used in the construction of the return object
|
||||
:type config: dict or :class:`colossalai.context.Config`
|
||||
:return: An object of :class:`nn.Module`
|
||||
:rtype: :class:`nn.Module`
|
||||
:return: An object of :class:`torch.nn.Module`
|
||||
:rtype: :class:`torch.nn.Module`
|
||||
"""
|
||||
return build_from_registry(config, MODELS)
|
||||
|
||||
|
@ -134,8 +134,8 @@ def build_gradient_handler(config, model, optimizer):
|
|||
:type model: :class:`nn.Module`
|
||||
:param optimizer: An optimizer object containing parameters for the gradient handler
|
||||
:type optimizer: :class:`torch.optim.Optimizer`
|
||||
:return: An object of :class:`BaseGradientHandler`
|
||||
:rtype: :class:`BaseGradientHandler`
|
||||
:return: An object of :class:`colossalai.engine.BaseGradientHandler`
|
||||
:rtype: :class:`colossalai.engine.BaseGradientHandler`
|
||||
"""
|
||||
config_ = config.copy()
|
||||
config_['model'] = model
|
||||
|
@ -151,8 +151,8 @@ def build_hooks(config, trainer):
|
|||
:type config: dict or :class:`colossalai.context.Config`
|
||||
:param trainer: A :class:`Trainer` object containing parameters for the hook
|
||||
:type trainer: :class:`Trainer`
|
||||
:return: An object of :class:`BaseHook`
|
||||
:rtype: :class:`BaseHook`
|
||||
:return: An object of :class:`colossalai.trainer.hooks.BaseHook`
|
||||
:rtype: :class:`colossalai.trainer.hooks.BaseHook`
|
||||
"""
|
||||
config_ = config.copy()
|
||||
config_['trainer'] = trainer
|
||||
|
@ -182,8 +182,8 @@ def build_data_sampler(config, dataset):
|
|||
:param dataset: An object of :class:`torch.utils.data.Dataset` containing information
|
||||
used in the construction of the return object
|
||||
:type dataset: :class:`torch.utils.data.Dataset`
|
||||
:return: An object of :class:`colossalai.nn.data.sampler.BaseSampler`
|
||||
:rtype: :class:`colossalai.nn.data.sampler.BaseSampler`
|
||||
:return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
|
||||
:rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
|
||||
"""
|
||||
config_ = config.copy()
|
||||
config_['dataset'] = dataset
|
||||
|
@ -200,10 +200,6 @@ def build_lr_scheduler(config, optimizer):
|
|||
:param optimizer: An optimizer object containing parameters for the learning rate
|
||||
scheduler
|
||||
:type optimizer: :class:`torch.optim.Optimizer`
|
||||
:param total_steps: Number of total steps of the learning rate scheduler
|
||||
:type total_steps: int
|
||||
:param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
|
||||
:type num_steps_per_epoch: int
|
||||
:return: An object of :class:`torch.optim.lr_scheduler`
|
||||
:rtype: :class:`torch.optim.lr_scheduler`
|
||||
"""
|
||||
|
|
|
@@ -151,6 +151,28 @@ def _partition_balanced(weights, pipeline_parallel_size, num_chunks):


class PipelineModelInitializer():
    """An initializer to split the model into different stages for pipeline parallelism.

    An example of the model config is shown below. The class VisionTransformerFromConfig should
    inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build the model from a sequence
    of layer configurations.

    model_config = dict(
        type='VisionTransformerFromConfig',
        embedding_cfg=dict(...),
        ...
    )

    :param config: configuration of the model
    :type config: dict
    :param num_chunks: the number of chunks you want to have on the current stage. This value should be 1
        in most cases unless you are using virtual pipeline parallelism.
    :type num_chunks: int
    :param verbose: whether to print the logs
    :type verbose: bool

    """

    def __init__(self, config, num_chunks, verbose=False):
        self.num_chunks = num_chunks
        self.ori_model = build_model(config)

@@ -161,6 +183,13 @@ class PipelineModelInitializer():
        self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])

    def initialize(self, partition_method='parameter'):
        """Initialize the model object from the config passed

        :param partition_method: this parameter determines how you want to split your model layers into stages,
            you can set it as 'layer' or 'parameter'
        :type partition_method: str

        """
        # Some space for initializing communication groups
        self._interval = None
        self._partition_layers(method=partition_method)

@@ -183,7 +212,7 @@ class PipelineModelInitializer():
            # print_rank_0(param_counts)
            self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
        else:
            assert method == 'layer', "Method should be a pre-set string"
            raise ValueError("Method should be a pre-set string in [layer, parameter]")

        # Display the partition
        if gpc.get_global_rank() == 0 and self.verbose:
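A short usage sketch for the initializer above, assuming a `model_config` in the ModelFromConfig layout shown in the docstring and assuming `initialize()` returns the partitioned model for the current stage.

```python
from colossalai.builder import PipelineModelInitializer  # assumed import path

initializer = PipelineModelInitializer(config=model_config, num_chunks=1, verbose=True)

# split layers across pipeline stages by parameter count (or pass 'layer')
model = initializer.initialize(partition_method='parameter')
```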
@ -18,11 +18,11 @@ def all_gather(tensor: Tensor, dim: int,
|
|||
:param tensor: Tensor to be gathered
|
||||
:param dim: The dimension concatenating in
|
||||
:param parallel_mode: Parallel group mode used in this communication
|
||||
:type tensor: Tensor
|
||||
:type tensor: :class:`torch.Tensor`
|
||||
:type dim: int
|
||||
:type parallel_mode: ParallelMode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:return: The tensor generated by all-gather
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
depth = gpc.get_world_size(parallel_mode)
|
||||
temp = tensor.clone()
|
||||
|
@ -54,11 +54,11 @@ def reduce_scatter(tensor: Tensor, dim: int,
|
|||
:param tensor: Tensor to be reduced and scattered
|
||||
:param dim: The dimension scattering in
|
||||
:param parallel_mode: Parallel group mode used in this communication
|
||||
:type tensor: Tensor
|
||||
:type tensor: :class:`torch.Tensor`
|
||||
:type dim: int
|
||||
:type parallel_mode: ParallelMode
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:return: The tensor generated by reduce-scatter
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
depth = gpc.get_world_size(parallel_mode)
|
||||
# temp = list(torch.chunk(tensor, depth, dim=dim))
|
||||
|
|
|
@ -96,7 +96,7 @@ def recv_forward(input_tensor_shape, prev_rank=None):
|
|||
:type input_tensor_shape: torch.Size
|
||||
:type prev_rank: int, optional
|
||||
:return: The input tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
if gpc.is_first_rank(ParallelMode.PIPELINE):
|
||||
input_tensor = None
|
||||
|
@ -115,7 +115,7 @@ def recv_backward(output_grad_shape, next_rank=None):
|
|||
:type output_grad_shape: torch.Size
|
||||
:type next_rank: int, optional
|
||||
:return: The grad of output tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
if gpc.is_last_rank(ParallelMode.PIPELINE):
|
||||
output_tensor_grad = None
|
||||
|
@ -131,7 +131,7 @@ def send_forward(output_tensor, next_rank=None):
|
|||
|
||||
:param output_tensor: Tensor to be sent
|
||||
:param next_rank: The rank of the recipient of the tensor
|
||||
:type output_tensor: Tensor
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:type next_rank: int, optional
|
||||
"""
|
||||
if not gpc.is_last_rank(ParallelMode.PIPELINE):
|
||||
|
@ -144,7 +144,7 @@ def send_backward(input_tensor_grad, prev_rank=None):
|
|||
|
||||
:param input_tensor_grad: Tensor to be sent
|
||||
:param prev_rank: The rank of the recipient of the tensor
|
||||
:type input_tensor_grad: Tensor
|
||||
:type input_tensor_grad: :class:`torch.Tensor`
|
||||
:type prev_rank: int, optional
|
||||
"""
|
||||
if not gpc.is_first_rank(ParallelMode.PIPELINE):
|
||||
|
@ -162,10 +162,10 @@ def send_forward_recv_backward(output_tensor,
|
|||
|
||||
:param output_tensor: Tensor to be sent
|
||||
:param output_grad_shape: The shape of the tensor to be received
|
||||
:type output_tensor: Tensor
|
||||
:type output_grad_shape: torch.Size
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:type output_grad_shape: :class:`torch.Size`
|
||||
:return: The grad of output tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
if gpc.is_last_rank(ParallelMode.PIPELINE):
|
||||
output_tensor_grad = None
|
||||
|
@ -187,10 +187,10 @@ def send_backward_recv_forward(input_tensor_grad,
|
|||
|
||||
:param input_tensor_grad: Tensor to be sent
|
||||
:param input_tensor_shape: The shape of the tensor to be received
|
||||
:type input_tensor_grad: Tensor
|
||||
:type input_tensor_shape: torch.Size
|
||||
:type input_tensor_grad: :class:`torch.Tensor`
|
||||
:type input_tensor_shape: :class:`torch.Size`
|
||||
:return: The input tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
if gpc.is_first_rank(ParallelMode.PIPELINE):
|
||||
input_tensor = None
|
||||
|
@ -213,10 +213,10 @@ def send_forward_recv_forward(output_tensor,
|
|||
|
||||
:param output_tensor: Tensor to be sent
|
||||
:param input_tensor_shape: The shape of the tensor to be received
|
||||
:type output_tensor: Tensor
|
||||
:type input_tensor_shape: torch.Size
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:type input_tensor_shape: :class:`torch.Size`
|
||||
:return: The input tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
input_tensor, _ = _communicate(tensor_send_next=output_tensor,
|
||||
recv_prev=recv_prev,
|
||||
|
@ -237,10 +237,10 @@ def send_backward_recv_backward(input_tensor_grad,
|
|||
|
||||
:param input_tensor_grad: Tensor to be sent
|
||||
:param output_grad_shape: The shape of the tensor to be received
|
||||
:type input_tensor_grad: Tensor
|
||||
:type output_grad_shape: torch.Size
|
||||
:type input_tensor_grad: :class:`torch.Tensor`
|
||||
:type output_grad_shape: :class:`torch.Size`
|
||||
:return: The grad of output tensor in forward step
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
_, output_tensor_grad = _communicate(tensor_send_prev=input_tensor_grad,
|
||||
recv_next=recv_next,
|
||||
|
@ -266,10 +266,10 @@ def send_forward_backward_recv_forward_backward(output_tensor,
|
|||
:param input_tensor_grad: Tensor sent to the previous
|
||||
:param input_tensor_shape: The shape of the tensor received from the previous
|
||||
:param output_grad_shape: The shape of the tensor received from the next
|
||||
:type output_tensor: Tensor
|
||||
:type input_tensor_grad: Tensor
|
||||
:type input_tensor_shape: torch.Size
|
||||
:type output_grad_shape: torch.Size
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:type input_tensor_grad: :class:`torch.Tensor`
|
||||
:type input_tensor_shape: :class:`torch.Size`
|
||||
:type output_grad_shape: :class:`torch.Size`
|
||||
:return: (the input tensor in forward step, the grad of output tensor in forward step)
|
||||
:rtype: (Tensor, Tensor)
|
||||
"""
|
||||
|
|
|
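To make the point-to-point helpers above concrete, here is a heavily simplified forward step on an intermediate pipeline stage; the tensor shape and the `stage_module` callable are illustrative assumptions, not taken from this diff.

```python
import torch
from colossalai.communication import recv_forward, send_forward  # assumed import path

# shape of the activation tensor produced by the previous stage (assumed)
input_tensor_shape = torch.Size([16, 1024])

# receive activations from the previous stage (returns None on the first stage)
input_tensor = recv_forward(input_tensor_shape)

# run this stage's slice of the model (hypothetical callable)
output_tensor = stage_module(input_tensor)

# send the activations on to the next stage (skipped on the last stage)
send_forward(output_tensor)
```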
@ -14,10 +14,10 @@ def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
|
|||
|
||||
:param tensor_send_next: Tensor sent to next member
|
||||
:param parallel_mode: Parallel group mode used in this communication
|
||||
:type tensor_send_next: Tensor
|
||||
:type parallel_mode: ParallelMode
|
||||
:type tensor_send_next: :class:`torch.Tensor`
|
||||
:type parallel_mode: :class:`colossalai.context.ParallelMode`
|
||||
:return: The tensor received from the previous
|
||||
:rtype: Tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
buffer_shape = tensor_send_next.size()
|
||||
|
||||
|
|
|
@ -433,6 +433,9 @@ class ParallelContext:
|
|||
|
||||
def set_device(self, device_ordinal: int = None):
|
||||
"""Sets distributed processes to be bound to devices.
|
||||
|
||||
:param device_ordinal: the device id to be bound to
|
||||
:type device_ordinal: int
|
||||
"""
|
||||
global_rank = self.get_global_rank()
|
||||
if device_ordinal is None:
|
||||
|
@ -445,6 +448,9 @@ class ParallelContext:
|
|||
|
||||
def set_seed(self, seed: int):
|
||||
"""Sets seeds for all random libraries.
|
||||
|
||||
:param seed: seed for random states
|
||||
:type seed: int
|
||||
"""
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
|
|
|
@@ -57,38 +57,61 @@ class Engine:

    @property
    def model(self):
        """model attached to the engine"""
        return self._model

    @property
    def optimizer(self):
        """optimizer attached to the engine"""
        return self._optimizer

    @property
    def criterion(self):
        """criterion attached to the engine"""
        return self._criterion

    @property
    def schedule(self):
        return self._schedule

    def zero_grad(self):
        """set the gradients of the parameters to zero
        """
        self.optimizer.zero_grad()

    def step(self):
        """execute the parameter update
        """
        self._all_reduce_gradients()
        self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
        self.optimizer.step()

    def backward(self, loss: Tensor):
        """Start backward propagation given the loss value computed by a loss function

        :param loss: loss value computed by a loss function
        :type loss: :class:`torch.Tensor`
        """
        return self.optimizer.backward(loss)

    def backward_by_grad(self, tensor, grad):
        """Start backward propagation given the gradient of the output tensor

        :param tensor: output tensor
        :type tensor: :class:`torch.Tensor`
        :param grad: gradient passed back to the output
        :type grad: :class:`torch.Tensor`
        """
        return self.optimizer.backward_by_grad(tensor, grad)

    def calc_loss(self, *args, **kwargs):
        """compute the loss value
        :return: the loss value
        :rtype: :class:`torch.Tensor`
        """
        return self.criterion(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """run the forward step for the model
        :return: output of the model
        :rtype: Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`
        """
        return self.model(*args, **kwargs)

    def _all_reduce_gradients(self):
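Putting the engine API above together, a single training step typically looks like the sketch below; the dataloader and the `.cuda()` placement are assumptions, and `engine` is the object returned by `colossalai.initialize`.

```python
# one simplified training step with an engine returned by colossalai.initialize
for img, label in train_dataloader:
    img, label = img.cuda(), label.cuda()

    engine.zero_grad()                       # clear gradients
    output = engine(img)                     # forward pass through the wrapped model
    loss = engine.calc_loss(output, label)   # criterion attached to the engine
    engine.backward(loss)                    # AMP/ZeRO-aware backward
    engine.step()                            # all-reduce, clip and update parameters
```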
@ -48,7 +48,7 @@ class BaseSchedule(ABC):
|
|||
already in the same GPU as where the model's.
|
||||
|
||||
:return: (data, label)
|
||||
:rtype: (Tensor, Tensor)
|
||||
:rtype: (:class:`torch.Tensor`, :class:`torch.Tensor`)
|
||||
"""
|
||||
if data_iter is None:
|
||||
raise RuntimeError('Dataloader is not defined.')
|
||||
|
|
|
@ -38,7 +38,9 @@ class NonPipelineSchedule(BaseSchedule):
|
|||
:type data_iter: Iterator
|
||||
:type forward_only: bool, optional
|
||||
:type return_loss: bool, optional
|
||||
|
||||
:return: (output, label, loss)
|
||||
:rtype: Tuple[:class:`torch.Tensor`]
|
||||
"""
|
||||
assert forward_only or return_loss, \
|
||||
"The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
|
||||
|
|
|
@ -133,6 +133,16 @@ class PipelineSchedule(BaseSchedule):
|
|||
"""Forward step for passed-in model. If it is the first stage, the input tensor
|
||||
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
|
||||
Returns output tensor. This is a helper function and can be ignored by users.
|
||||
|
||||
:param engine: your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param input_tensor: input tensor for this pipeline stage
|
||||
:type input_tensor: :class:`torch.Tensor`
|
||||
:param return_tensors: a list of tensors to return
|
||||
:type return_tensors: List[:class:`torch.Tensor`]
|
||||
|
||||
:return: output or the loss value of the current pipeline stage
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
|
||||
if input_tensor is None:
|
||||
|
@ -162,6 +172,18 @@ class PipelineSchedule(BaseSchedule):
|
|||
output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
|
||||
Returns the gradients with respect to the input tensor (None if first stage).
|
||||
This is a helper function and can be ignored by users.
|
||||
|
||||
:param engine: your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param input_tensor: input tensor for this pipeline stage
|
||||
:type input_tensor: :class:`torch.Tensor`
|
||||
:param output_tensor: output tensor for this pipeline stage
|
||||
:type output_tensor: :class:`torch.Tensor`
|
||||
:param output_tensor_grad: gradient of output tensor for this pipeline stage
|
||||
:type output_tensor_grad: :class:`torch.Tensor`
|
||||
|
||||
:return: gradient of input tensor
|
||||
:rtype: :class:`torch.Tensor`
|
||||
"""
|
||||
|
||||
# Retain the grad on the input_tensor.
|
||||
|
@ -189,7 +211,17 @@ class PipelineSchedule(BaseSchedule):
|
|||
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
|
||||
Returns a tuple with losses if the last stage, an empty tuple otherwise.
|
||||
|
||||
:param engine: your engine object
|
||||
:type engine: colossalai.engine.Engine
|
||||
:param data_iter: dataloader as the form of an iterator, obtained by calling iter(dataloader)
|
||||
:type data_iter: Iterable
|
||||
:param forward_only: whether run forward step only. Default is false. If true, no backward will be run.
|
||||
:type forward_only: bool
|
||||
:param return_loss: whether returns the loss value. Default is true.
|
||||
:type return_loss: bool
|
||||
|
||||
:return: (output, label, loss)
|
||||
:rtype: Tuple[:class:`torch.Tensor`]
|
||||
"""
|
||||
|
||||
assert forward_only or return_loss, \
|
||||
|
|
|
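When pipeline parallelism is used, this schedule is driven by a `schedule` entry in the config file, as in the example configs removed elsewhere in this change; the number of microbatches below is illustrative.

```python
# config.py -- split each batch into microbatches for the 1F1B pipeline schedule
schedule = dict(
    num_microbatches=8
)
```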
@ -82,6 +82,8 @@ def launch(config: Union[str, Path, Config, Dict],
|
|||
:param local_rank: rank for the process on the node and is used to set the default CUDA device,
|
||||
defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
|
||||
:type local_rank: int, optional
|
||||
:param verbose: whether to print logs
|
||||
:type verbose: bool
|
||||
:raises Exception: raise exception when config type is wrong
|
||||
'''
|
||||
gpc.verbose = verbose
|
||||
|
@ -121,6 +123,20 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
|
|||
backend: str = 'nccl',
|
||||
seed: int = 1024,
|
||||
verbose: bool = True):
|
||||
'''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
|
||||
set by SLURM
|
||||
|
||||
:param config: config file or config file path are both acceptable
|
||||
:type config: Union[str, dict, Config]
|
||||
:param host: the master address for distributed training
|
||||
:type host: str
|
||||
:param port: the master port for distributed training
|
||||
:type port: str
|
||||
:param backend: backend for torch.distributed
|
||||
:type backend: str
|
||||
:param verbose: whether to print logs
|
||||
:type verbose: bool
|
||||
'''
|
||||
rank = int(os.environ['SLURM_PROCID'])
|
||||
world_size = int(os.environ['SLURM_NPROCS'])
|
||||
launch(config=config,
|
||||
|
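A hedged sketch of launching through the SLURM wrapper described above; the config path, host, and port are placeholders.

```python
# train.py -- assumed entry script submitted via srun
import colossalai

colossalai.launch_from_slurm(config='./config.py',
                             host='node001',   # master node address (placeholder)
                             port=29500,       # master port (placeholder)
                             backend='nccl')
```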
@ -139,6 +155,20 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
|
|||
backend: str = 'nccl',
|
||||
seed: int = 1024,
|
||||
verbose: bool = True):
|
||||
'''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
|
||||
set by OpenMPI
|
||||
|
||||
:param config: config file or config file path are both acceptable
|
||||
:type config: Union[str, dict, Config]
|
||||
:param host: the master address for distributed training
|
||||
:type host: str
|
||||
:param port: the master port for distributed training
|
||||
:type port: str
|
||||
:param backend: backend for torch.distributed
|
||||
:type backend: str
|
||||
:param verbose: whether to print logs
|
||||
:type verbose: bool
|
||||
'''
|
||||
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
|
||||
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
|
||||
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
|
||||
|
@ -159,6 +189,20 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
|
|||
backend: str = 'nccl',
|
||||
seed: int = 1024,
|
||||
verbose: bool = True):
|
||||
'''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
|
||||
from the environment variables set by PyTorch
|
||||
|
||||
:param config: config file or config file path are both acceptable
|
||||
:type config: Union[str, dict, Config]
|
||||
:param host: the master address for distributed training
|
||||
:type host: str
|
||||
:param port: the master port for distributed training
|
||||
:type port: str
|
||||
:param backend: backend for torch.distributed
|
||||
:type backend: str
|
||||
:param verbose: whether to print logs
|
||||
:type verbose: bool
|
||||
'''
|
||||
rank = int(os.environ['RANK'])
|
||||
local_rank = int(os.environ['LOCAL_RANK'])
|
||||
world_size = int(os.environ['WORLD_SIZE'])
|
||||
|
@@ -184,16 +228,20 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
    '''Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.

    :param model: your model instance
    :type model: a single or a list of ``torch.nn.Module`` objects
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer instance
    :type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects
    :type optimizer: :class:`torch.optim.optimizer.Optimizer`
    :param criterion: your criterion instance
    :type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects
    :param train_dataloader: dataloaders for training data
    :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
    :param test_dataloader: dataloaders for testing data
    :type test_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None
    :return: (engine, criterion, train_dataloader, test_dataloader)
    :type criterion: :class:`torch.nn.modules.loss._Loss`
    :param train_dataloader: dataloader for training data
    :type train_dataloader: :class:`torch.utils.data.DataLoader`
    :param test_dataloader: dataloader for testing data
    :type test_dataloader: :class:`torch.utils.data.DataLoader`
    :param lr_scheduler: your lr scheduler instance
    :type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
    :param verbose: whether to print logs
    :type verbose: bool
    :return: (engine, train_dataloader, test_dataloader, lr_scheduler)
    :rtype: tuple
    '''
    # get logger
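For context, the helper above is typically used as in the following sketch; the model, optimizer, criterion and dataloaders are assumed to have been built beforehand, and the host/port values are placeholders.

```python
import colossalai

# set up the distributed environment first (one of the launch wrappers above)
colossalai.launch_from_torch(config='./config.py', host='localhost', port=29500)

# wrap the raw components according to gpc.config (fp16, zero, gradient handlers, ...)
engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader)
```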
@@ -6,5 +6,11 @@ __all__ = ['get_dist_logger', 'DistributedLogger']
def get_dist_logger(name='root'):
    """Get a logger instance based on name. DistributedLogger creates singleton instances,
    which means that only one logger instance is created per name.

    :param name: name of the logger, name must be unique
    :type name: str

    :return: a distributed logger instance
    :rtype: :class:`colossalai.logging.DistributedLogger`
    """
    return DistributedLogger.get_instance(name=name)
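A small usage sketch; the `ranks=[0]` filter restricts output to the listed ranks, as used elsewhere in this change.

```python
from colossalai.logging import get_dist_logger

logger = get_dist_logger()                    # singleton 'root' logger
logger.info('training started', ranks=[0])    # log only on global rank 0
```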
@ -47,9 +47,24 @@ class ViTBlock(nn.Module):
|
|||
@LAYERS.register_module
|
||||
class VanillaViTPatchEmbedding(nn.Module):
|
||||
""" 2D Image to Patch Embedding
|
||||
|
||||
:param img_size: image size
|
||||
:type img_size: int
|
||||
:param patch_size: size of a patch
|
||||
:type patch_size: int
|
||||
:param in_chans: input channels
|
||||
:type in_chans: int
|
||||
:param embed_dim: embedding dimension
|
||||
:type embed_dim: int
|
||||
:param norm_layer: layer norm class, defaults to None
|
||||
:type norm_layer: Callable
|
||||
:param flatten: whether to flatten the output
|
||||
:type flatten: bool
|
||||
:param drop: dropout rate
|
||||
:type drop: float
|
||||
"""
|
||||
|
||||
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True, drop=0.):
|
||||
def __init__(self, img_size, patch_size, in_chans, embed_dim, norm_layer=None, flatten=True, drop=0.):
|
||||
super().__init__()
|
||||
img_size = to_2tuple(img_size)
|
||||
patch_size = to_2tuple(patch_size)
|
||||
|
@ -84,12 +99,22 @@ class VanillaViTPatchEmbedding(nn.Module):
|
|||
@LAYERS.register_module
|
||||
class VanillaViTMLP(nn.Module):
|
||||
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
|
||||
|
||||
:param in_features: input channels
|
||||
:type in_features: int
|
||||
:param hidden_features: channels of the output of the first dense layer
|
||||
:type hidden_features: int
|
||||
:param out_features: channels of the output of the second dense layer
|
||||
:type hidden_features: int
|
||||
:param act_layer: activation function
|
||||
:type act_layer: Callable
|
||||
:param drop: dropout rate
|
||||
:type drop: float
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
|
||||
def __init__(self, in_features, hidden_features, out_features, act_layer=nn.GELU, drop=0.):
|
||||
super().__init__()
|
||||
out_features = out_features or in_features
|
||||
hidden_features = hidden_features or in_features
|
||||
self.fc1 = nn.Linear(in_features, hidden_features)
|
||||
self.act = act_layer()
|
||||
self.fc2 = nn.Linear(hidden_features, out_features)
|
||||
|
@ -113,6 +138,11 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
|
|||
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
|
||||
'survival rate' as the argument.
|
||||
|
||||
:param drop_prob: probability for dropout
|
||||
:type drop_prob: float
|
||||
:param training: whether it is training mode
|
||||
:type training: bool
|
||||
|
||||
"""
|
||||
if drop_prob == 0. or not training:
|
||||
return x
|
||||
|
@ -129,6 +159,9 @@ def drop_path(x, drop_prob: float = 0., training: bool = False):
|
|||
@LAYERS.register_module
|
||||
class VanillaViTDropPath(nn.Module):
|
||||
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
|
||||
|
||||
:param drop_prob: probability for dropout
|
||||
:type drop_prob: float
|
||||
"""
|
||||
|
||||
def __init__(self, drop_prob=0.):
|
||||
|
@ -145,7 +178,7 @@ class VanillaViTAttention(nn.Module):
|
|||
|
||||
:param dim: dimension of input tensor
|
||||
:type dim: int
|
||||
:param num_heads: number of attention heads, defaults to 8
|
||||
:param num_heads: number of attention heads
|
||||
:type num_heads: int, optional
|
||||
:param qkv_bias: enable bias for qkv if True, defaults to False
|
||||
:type qkv_bias: bool, optional
|
||||
|
@ -155,7 +188,7 @@ class VanillaViTAttention(nn.Module):
|
|||
:type proj_drop: float, optional
|
||||
"""
|
||||
|
||||
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
|
||||
def __init__(self, dim, num_heads, qkv_bias=False, attn_drop=0., proj_drop=0.):
|
||||
super().__init__()
|
||||
self.num_heads = num_heads
|
||||
head_dim = dim // num_heads
|
||||
|
|
|
@ -109,15 +109,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
|
|||
added functionality to handle model parallel parameters. Note that
|
||||
the gradients are modified in place.
|
||||
|
||||
Arguments:
|
||||
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
|
||||
single Tensor that will have gradients normalized
|
||||
max_norm (float or int): max norm of the gradients
|
||||
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
|
||||
infinity norm.
|
||||
:param parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
|
||||
:type parameters: (Iterable[Tensor] or Tensor)
|
||||
:param max_norm: max norm of the gradients
|
||||
:type max_norm: float or int
|
||||
:param norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm.
|
||||
:type norm_type: float or int
|
||||
|
||||
Returns:
|
||||
Total norm of the parameters (viewed as a single vector).
|
||||
:return: Total norm of the parameters (viewed as a single vector).
|
||||
:rtype: float
|
||||
"""
|
||||
|
||||
if isinstance(parameters, torch.Tensor):
|
||||
|
|
|
@@ -123,12 +123,23 @@ def get_dataloader(dataset,
    stage and label on the last stage

    :param dataset: a :class:utils.data.dataset dataset
    :param shuffle: whether to shuffle the dataset
    :param seed: random worker seed, defaults to 1024
    :type seed: int, optional
    :param add_sampler_if_possible: [description], defaults to False
    :type add_sampler_if_possible: bool, optional
    :return: a :class:utils.data.dataset dataloader
    :rtype: torch.utils.data.dataset
    :param add_sampler: add a DistributedDataParallelSampler to the dataset
    :param drop_last: drop the last incomplete batch of data
    :param pin_memory: whether to pin memory addresses in CPU memory
    :param num_workers: number of worker threads for this dataloader

    :type dataset: :class:`torch.utils.data.Dataset`
    :type shuffle: bool, optional. Default is False
    :type seed: int, optional. Default is 1024
    :type add_sampler: bool, optional. Default is True
    :type drop_last: bool, optional. Default is False
    :type pin_memory: bool, optional. Default is False
    :type num_workers: int, optional. Default is 0

    :return: an object of :class:`torch.utils.data.DataLoader`
    :rtype: :class:`torch.utils.data.DataLoader`
    '''
    _kwargs = kwargs.copy()
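A sketch exercising the parameters documented above; the toy dataset is an assumption, and extra keyword arguments such as `batch_size` are assumed to be forwarded to `torch.utils.data.DataLoader`.

```python
import torch
from torch.utils.data import TensorDataset
from colossalai.utils import get_dataloader  # assumed public import path

# toy dataset standing in for a real one
train_dataset = TensorDataset(torch.randn(1024, 3, 32, 32),
                              torch.randint(0, 10, (1024,)))

train_dataloader = get_dataloader(dataset=train_dataset,
                                  shuffle=True,
                                  seed=1024,
                                  add_sampler=True,   # attach a distributed sampler
                                  drop_last=True,
                                  pin_memory=True,
                                  num_workers=2,
                                  batch_size=256)     # forwarded to DataLoader
```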
@ -13,6 +13,20 @@ def accumulate_gradient(model: nn.Module,
|
|||
accumulate_size: int,
|
||||
gradient_handlers: List[BaseGradientHandler] = None,
|
||||
lr_scheduler: _LRScheduler = None):
|
||||
"""
|
||||
:param model: your model object
|
||||
:type model: :class:`torch.nn.Module`
|
||||
:param optimizer: your optimizer object
|
||||
:type optimizer: :class:`torch.optim.Optimizer`
|
||||
:param dataloader: your dataloader object
|
||||
:type dataloader: Iterable
|
||||
:param accumulate_size: the number of steps to accumulate gradients
|
||||
:type accumulate_size: int
|
||||
:param gradient_handlers: list of gradient handler objects. Default is None
|
||||
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
|
||||
:param lr_scheduler: your lr scheduler object. Default is None
|
||||
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
|
||||
"""
|
||||
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
|
||||
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
|
||||
|
||||
|
|
|
@ -14,6 +14,17 @@ from colossalai.engine import BaseGradientHandler
|
|||
|
||||
|
||||
class GradAccumOptimizer(ColossalaiOptimizer):
|
||||
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
|
||||
before accumulation size is reached
|
||||
|
||||
:param optim: your optimizer object
|
||||
:type optim: :class:`torch.optim.Optimizer`
|
||||
:param accumulate_size: the number of steps to accumulate gradients
|
||||
:type accumulate_size: int
|
||||
:param model: your model object to check if it is DDP for special handling of no_sync() context
|
||||
:type model: :class:`torch.nn.Module`
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
|
||||
super().__init__(optim)
|
||||
|
@ -64,6 +75,19 @@ class GradAccumOptimizer(ColossalaiOptimizer):
|
|||
|
||||
|
||||
class GradAccumDataloader():
|
||||
"""A wrapper for a dataloader to enable gradient accumulation by dropping the last incomplete steps.
|
||||
|
||||
For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
(e.g. a DALI dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
|
||||
|
||||
:param dataloader: your dataloader object
|
||||
:type dataloader: Iterable
|
||||
:param accumulate_size: the number of steps to accumulate gradients
|
||||
:type accumulate_size: int
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
|
||||
self.dataloader = dataloader
|
||||
|
@ -99,6 +123,15 @@ class GradAccumDataloader():
|
|||
|
||||
|
||||
class GradAccumLrSchedulerByStep(_LRScheduler):
|
||||
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
|
||||
before accumulation size is reached
|
||||
|
||||
:param lr_scheduler: your lr scheduler object
|
||||
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
|
||||
:param accumulate_size: the number of steps to accumulate gradients
|
||||
:type accumulate_size: int
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
|
||||
self.lr_scheduler = lr_scheduler
|
||||
|
@ -137,6 +170,15 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
|
|||
|
||||
|
||||
class GradAccumGradientHandler():
|
||||
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
|
||||
before accumulation size is reached
|
||||
|
||||
:param grad_handler: your gradient handler object
|
||||
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
|
||||
:param accumulate_size: the number of steps to accumulate gradients
|
||||
:type accumulate_size: int
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
|
||||
assert isinstance(grad_handler, BaseGradientHandler), \
|
||||
|
|
|
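To illustrate the wrappers above, a minimal sketch composing them by hand; the module path in the import, and the assumption that the wrappers are drop-in replacements for the objects they wrap, are not taken from this diff.

```python
from colossalai.engine.gradient_accumulation import (  # assumed module path
    GradAccumDataloader, GradAccumLrSchedulerByStep, GradAccumOptimizer)

# optimizer, model, train_dataloader, lr_scheduler are assumed to exist already
accumulate_size = 4   # update parameters once every 4 steps

optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
train_dataloader = GradAccumDataloader(train_dataloader, accumulate_size=accumulate_size)
lr_scheduler = GradAccumLrSchedulerByStep(lr_scheduler, accumulate_size=accumulate_size)
```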
@ -34,6 +34,10 @@ def report_memory_usage(message, logger=None, report_cpu=False):
|
|||
|
||||
:param message: a prefix message to add in the log
|
||||
:type message: str
|
||||
:param logger: an instance of :class:`colossalai.logging.DistributedLogger`
|
||||
:type logger: :class:`colossalai.logging.DistributedLogger`
|
||||
:param report_cpu: whether to report CPU memory
|
||||
:type report_cpu: bool
|
||||
:raises EnvironmentError: raise error if no distributed environment has been initialized
|
||||
'''
|
||||
if not gpc.is_initialized(ParallelMode.GLOBAL):
|
||||
|
|
|
@ -2,6 +2,13 @@
|
|||
|
||||
|
||||
class MultiTensorApply(object):
|
||||
"""
|
||||
Apply an operation to a list of tensors efficiently
|
||||
|
||||
:param chunk_size: size of a chunk
|
||||
:type chunk_size: int
|
||||
"""
|
||||
|
||||
available = False
|
||||
warned = False
|
||||
|
||||
|
|
|
@ -74,6 +74,9 @@ class Timer:
|
|||
|
||||
class MultiTimer:
|
||||
'''An object that contains multiple timers
|
||||
|
||||
:param on: whether the timer is enabled. Default is True
|
||||
:type on: bool
|
||||
'''
|
||||
|
||||
def __init__(self, on: bool = True):
|
||||
|
|
|
@@ -14,6 +14,21 @@ def convert_to_zero(model: nn.Module,
                    optimizer: Optimizer,
                    level: int,
                    zero_config):
    """
    A helper function to integrate the model and optimizer with ZeRO optimizer and off-loading

    :param model: your model object
    :type model: :class:`torch.nn.Module`
    :param optimizer: your optimizer object
    :type optimizer: :class:`torch.optim.Optimizer`
    :param level: optimizer level, can be 2 or 3
    :type level: int
    :param zero_config: configuration for zero
    :type zero_config: dict

    :return: (model, optimizer)
    :rtype: Tuple
    """
    assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'
    if level == 2:
        if is_no_pp_or_last_stage():
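A usage sketch for the helper above; the toy model and the empty `zero_config` are placeholders, and the import path is an assumption.

```python
import torch
import torch.nn as nn
from colossalai.zero import convert_to_zero  # assumed import path

model = nn.Linear(128, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# level 2 shards optimizer states; backend-specific options go in zero_config (omitted here)
model, optimizer = convert_to_zero(model=model,
                                   optimizer=optimizer,
                                   level=2,
                                   zero_config=dict())
```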
@ -1,76 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
import os
|
||||
|
||||
IMG_SIZE = 224
|
||||
BATCH_SIZE = 256
|
||||
NUM_EPOCHS = 100
|
||||
|
||||
model = dict(
|
||||
type='VanillaResNet',
|
||||
block_type='ResNetBottleneck',
|
||||
layers=[3, 4, 6, 3],
|
||||
num_cls=10
|
||||
)
|
||||
|
||||
train_data = dict(
|
||||
dataset=dict(
|
||||
type='CIFAR10Dataset',
|
||||
root=os.environ['DATA'],
|
||||
transform_pipeline=[
|
||||
dict(type='Resize', size=IMG_SIZE),
|
||||
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
|
||||
dict(type='RandomHorizontalFlip'),
|
||||
dict(type='ToTensor'),
|
||||
dict(type='Normalize',
|
||||
mean=[0.4914, 0.4822, 0.4465],
|
||||
std=[0.2023, 0.1994, 0.2010]),
|
||||
]
|
||||
),
|
||||
dataloader=dict(
|
||||
batch_size=BATCH_SIZE,
|
||||
pin_memory=True,
|
||||
shuffle=True,
|
||||
)
|
||||
)
|
||||
|
||||
test_data = dict(
|
||||
dataset=dict(
|
||||
type='CIFAR10Dataset',
|
||||
root=os.environ['DATA'],
|
||||
train=False,
|
||||
transform_pipeline=[
|
||||
dict(type='Resize', size=IMG_SIZE),
|
||||
dict(type='ToTensor'),
|
||||
dict(type='Normalize',
|
||||
mean=[0.4914, 0.4822, 0.4465],
|
||||
std=[0.2023, 0.1994, 0.2010]
|
||||
),
|
||||
]
|
||||
),
|
||||
dataloader=dict(
|
||||
batch_size=BATCH_SIZE,
|
||||
pin_memory=True,
|
||||
)
|
||||
)
|
||||
|
||||
parallelization = dict(
|
||||
pipeline=1,
|
||||
tensor=dict(size=1, mode=None),
|
||||
)
|
||||
|
||||
optimizer = dict(
|
||||
type='Adam',
|
||||
lr=0.01
|
||||
)
|
||||
|
||||
loss = dict(
|
||||
type='CrossEntropyLoss'
|
||||
)
|
||||
|
||||
from colossalai.engine import AMP_TYPE
|
||||
|
||||
fp16 = dict(
|
||||
mode=AMP_TYPE.APEX,
|
||||
opt_level='O2',
|
||||
)
|
|
@ -1,22 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
NUM_EPOCH = int
|
||||
|
||||
model = dict()
|
||||
train_data = dict()
|
||||
test_data = dict()
|
||||
optimizer = dict()
|
||||
loss = dict()
|
||||
|
||||
fp16 = dict()
|
||||
zero = dict()
|
||||
|
||||
gradient_handler = []
|
||||
parallel = dict()
|
||||
hooks = []
|
||||
|
||||
cudnn_benchmark = True
|
||||
cudnn_deterministic = False
|
||||
|
||||
logging = dict()
|
|
@ -1,165 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
BATCH_SIZE = 512
|
||||
IMG_SIZE = 32
|
||||
PATCH_SIZE = 4
|
||||
DIM = 512
|
||||
NUM_ATTENTION_HEADS = 2
|
||||
SUMMA_DIM = 2
|
||||
NUM_CLASSES = 10
|
||||
DEPTH = 6
|
||||
NUM_EPOCHS = 60
|
||||
|
||||
train_data = dict(
|
||||
dataset=dict(
|
||||
type='CIFAR10Dataset',
|
||||
root=Path(os.environ['DATA']),
|
||||
transform_pipeline=[
|
||||
dict(type='Resize', size=IMG_SIZE),
|
||||
dict(type='RandomCrop', size=IMG_SIZE, padding=4),
|
||||
dict(type='RandomHorizontalFlip'),
|
||||
dict(type='ToTensor'),
|
||||
dict(type='Normalize',
|
||||
mean=[0.4914, 0.4822, 0.4465],
|
||||
std=[0.2023, 0.1994, 0.2010]),
|
||||
]
|
||||
),
|
||||
dataloader=dict(
|
||||
batch_size=BATCH_SIZE,
|
||||
drop_last=True,
|
||||
pin_memory=True,
|
||||
shuffle=True,
|
||||
)
|
||||
)
|
||||
|
||||
test_data = dict(
|
||||
dataset=dict(
|
||||
type='CIFAR10Dataset',
|
||||
root=Path(os.environ['DATA']),
|
||||
train=False,
|
||||
transform_pipeline=[
|
||||
dict(type='Resize', size=IMG_SIZE),
|
||||
dict(type='ToTensor'),
|
||||
dict(type='Normalize',
|
||||
mean=[0.4914, 0.4822, 0.4465],
|
||||
std=[0.2023, 0.1994, 0.2010]
|
||||
),
|
||||
]
|
||||
),
|
||||
dataloader=dict(
|
||||
batch_size=BATCH_SIZE,
|
||||
pin_memory=True,
|
||||
)
|
||||
)
|
||||
|
||||
optimizer = dict(
|
||||
type='Adam',
|
||||
lr=0.001,
|
||||
weight_decay=0
|
||||
)
|
||||
|
||||
loss = dict(
|
||||
type='CrossEntropyLoss2D',
|
||||
)
|
||||
|
||||
model = dict(
|
||||
type='VisionTransformerFromConfig',
|
||||
tensor_splitting_cfg=dict(
|
||||
type='ViTInputSplitter2D',
|
||||
),
|
||||
embedding_cfg=dict(
|
||||
type='ViTPatchEmbedding2D',
|
||||
img_size=IMG_SIZE,
|
||||
patch_size=PATCH_SIZE,
|
||||
embed_dim=DIM,
|
||||
),
|
||||
token_fusion_cfg=dict(
|
||||
type='ViTTokenFuser2D',
|
||||
img_size=IMG_SIZE,
|
||||
patch_size=PATCH_SIZE,
|
||||
embed_dim=DIM,
|
||||
drop_rate=0.1
|
||||
),
|
||||
norm_cfg=dict(
|
||||
type='LayerNorm2D',
|
||||
normalized_shape=DIM,
|
||||
eps=1e-6,
|
||||
),
|
||||
block_cfg=dict(
|
||||
type='ViTBlock',
|
||||
attention_cfg=dict(
|
||||
type='ViTSelfAttention2D',
|
||||
hidden_size=DIM,
|
||||
num_attention_heads=NUM_ATTENTION_HEADS,
|
||||
attention_dropout_prob=0.,
|
||||
hidden_dropout_prob=0.1,
|
||||
checkpoint=True
|
||||
),
|
||||
droppath_cfg=dict(
|
||||
type='VanillaViTDropPath',
|
||||
),
|
||||
mlp_cfg=dict(
|
||||
type='ViTMLP2D',
|
||||
in_features=DIM,
|
||||
dropout_prob=0.1,
|
||||
mlp_ratio=4,
|
||||
checkpoint=True
|
||||
),
|
||||
norm_cfg=dict(
|
||||
type='LayerNorm2D',
|
||||
normalized_shape=DIM,
|
||||
eps=1e-6,
|
||||
),
|
||||
),
|
||||
head_cfg=dict(
|
||||
type='ViTHead2D',
|
||||
hidden_size=DIM,
|
||||
num_classes=NUM_CLASSES,
|
||||
),
|
||||
embed_dim=DIM,
|
||||
depth=DEPTH,
|
||||
drop_path_rate=0.,
|
||||
)
|
||||
|
||||
hooks = [
|
||||
dict(type='LogMetricByEpochHook'),
|
||||
dict(type='Accuracy2DHook'),
|
||||
dict(type='LossHook'),
|
||||
dict(
|
||||
type='LRSchedulerHook',
|
||||
by_epoch=True,
|
||||
lr_scheduler_cfg=dict(
|
||||
type='LinearWarmupLR',
|
||||
warmup_steps=5
|
||||
)
|
||||
),
|
||||
# dict(type='TensorboardHook', log_dir='./tb_logs'),
|
||||
# dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
|
||||
# dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
|
||||
]
|
||||
|
||||
parallel = dict(
|
||||
pipeline=dict(size=1),
|
||||
tensor=dict(size=4, mode='2d'),
|
||||
)
|
||||
|
||||
# for fp16 training
|
||||
# from colossalai.engine import AMP_TYPE
|
||||
# fp16 = dict(
|
||||
# mode=AMP_TYPE.PARALLEL,
|
||||
# initial_scale=2 ** 8
|
||||
# )
|
||||
|
||||
# only needed when pipeline parallel is used
|
||||
# schedule = dict(
|
||||
# num_microbatches=8
|
||||
# )
|
||||
|
||||
|
||||
logging = dict(
|
||||
root_path='./logs'
|
||||
)
|
|
@@ -1,111 +0,0 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os
from pathlib import Path

from colossalai.context import ParallelMode
from colossalai.engine import AMP_TYPE

try:
    import model_zoo
except:
    print('You need to set model_zoo to your PYTHONPATH to use the models in the collection')

BATCH_SIZE = 512
IMG_SIZE = 32
NUM_EPOCHS = 60

train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=2,
        shuffle=True,
    )
)

test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=2,
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001
)

loss = dict(
    type='CrossEntropyLoss3D',
    input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
    weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
)

model = dict(
    type='vit_tiny_3d_patch4_32',
    drop_rate=0.1,
)

hooks = [
    dict(type='LogMetricByEpochHook'),
    dict(type='LogTimingByEpochHook'),
    dict(type='LogMemoryByEpochHook'),
    dict(
        type='Accuracy3DHook',
        input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT,
        weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT,
    ),
    dict(type='LossHook'),
    dict(type='TensorboardHook', log_dir='./tfb_logs'),
    dict(
        type='LRSchedulerHook',
        by_epoch=True,
        lr_scheduler_cfg=dict(
            type='LinearWarmupLR',
            warmup_steps=5
        )
    ),
    # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'),
    # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt')
]

parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=8, mode='3d'),
)

fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 8
)

logging = dict(
    root_path='./logs'
)
@@ -77,10 +77,10 @@ fp16 = dict(
)
```

## Tensor Parallel AMP
## Naive AMP

We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor
and pipeline parallelism.
and pipeline parallelism. This AMP mode will cast all operations into fp16.

The following code block shows a config file for this mode.
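The config block that the sentence above refers to is not included in this excerpt of the diff. As a rough sketch, a config enabling this mode under the new `colossalai.amp` API would presumably look like the following; the `AMP_TYPE.NAIVE` member and the scaler option are assumptions that mirror the `AMP_TYPE.PARALLEL` example used elsewhere in this repository, not lines taken from the diff.

```python
# Sketch of a config enabling the Megatron-style (naive) AMP mode.
# AMP_TYPE.NAIVE and initial_scale are assumed names for illustration.
from colossalai.amp import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.NAIVE,
    initial_scale=2 ** 8,
)
```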
@@ -0,0 +1,5 @@
colossalai.amp.apex\_amp
==========================

.. automodule:: colossalai.amp.apex_amp
    :members:

@@ -0,0 +1,5 @@
colossalai.amp.naive\_amp
==========================

.. automodule:: colossalai.amp.naive_amp
    :members:

@@ -0,0 +1,13 @@
colossalai.amp
==================

.. toctree::
    :maxdepth: 2

    colossalai.amp.torch_amp
    colossalai.amp.apex_amp
    colossalai.amp.naive_amp


.. automodule:: colossalai.amp
    :members:

@@ -0,0 +1,5 @@
colossalai.amp.torch\_amp
==========================

.. automodule:: colossalai.amp.torch_amp
    :members:

@@ -1,12 +1,12 @@
colossalai.builder
==================

.. automodule:: colossalai.builder
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.builder.builder
    colossalai.builder.pipeline


.. automodule:: colossalai.builder
    :members:

@@ -1,5 +0,0 @@
colossalai.checkpointing
========================

.. automodule:: colossalai.checkpointing
    :members:

@@ -1,10 +1,6 @@
colossalai.communication
========================

.. automodule:: colossalai.communication
    :members:


.. toctree::
    :maxdepth: 2

@@ -12,3 +8,7 @@ colossalai.communication
    colossalai.communication.p2p
    colossalai.communication.ring
    colossalai.communication.utils


.. automodule:: colossalai.communication
    :members:
@@ -1,11 +1,11 @@
colossalai.context.random
=========================

.. automodule:: colossalai.context.random
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.context.random.seed_manager


.. automodule:: colossalai.context.random
    :members:

@@ -1,9 +1,6 @@
colossalai.context
==================

.. automodule:: colossalai.context
    :members:

.. toctree::
    :maxdepth: 2

@@ -17,3 +14,7 @@ colossalai.context
    colossalai.context.config
    colossalai.context.parallel_context
    colossalai.context.parallel_mode


.. automodule:: colossalai.context
    :members:

@@ -1,5 +0,0 @@
colossalai.engine.amp.amp\_type
===============================

.. automodule:: colossalai.engine.amp.amp_type
    :members:

@@ -1,5 +0,0 @@
colossalai.engine.amp.grad\_scaler
==================================

.. automodule:: colossalai.engine.amp.grad_scaler
    :members:

@@ -1,12 +0,0 @@
colossalai.engine.amp
=====================

.. automodule:: colossalai.engine.amp
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.engine.amp.amp_type
    colossalai.engine.amp.grad_scaler

@@ -1,12 +1,12 @@
colossalai.engine
=================

.. automodule:: colossalai.engine
    :members:

.. toctree::
    :maxdepth: 2

    colossalai.engine.amp
    colossalai.engine.gradient_handler
    colossalai.engine.schedule


.. automodule:: colossalai.engine
    :members:

@@ -1,11 +1,11 @@
colossalai.logging
==================

.. automodule:: colossalai.logging
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.logging.logging


.. automodule:: colossalai.logging
    :members:
@@ -1,5 +0,0 @@
colossalai.nn.data.base\_dataset
================================

.. automodule:: colossalai.nn.data.base_dataset
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.data.caltech101\_dataset
======================================

.. automodule:: colossalai.nn.data.caltech101_dataset
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.data.cifar10\_dataset
===================================

.. automodule:: colossalai.nn.data.cifar10_dataset
    :members:

@@ -1,18 +0,0 @@
colossalai.nn.data
==================

.. automodule:: colossalai.nn.data
    :members:

.. toctree::
    :maxdepth: 2

    colossalai.nn.data.sampler


.. toctree::
    :maxdepth: 2

    colossalai.nn.data.base_dataset
    colossalai.nn.data.caltech101_dataset
    colossalai.nn.data.cifar10_dataset

@@ -1,5 +0,0 @@
colossalai.nn.data.sampler.base\_sampler
========================================

.. automodule:: colossalai.nn.data.sampler.base_sampler
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.data.sampler.data\_parallel\_sampler
==================================================

.. automodule:: colossalai.nn.data.sampler.data_parallel_sampler
    :members:

@@ -1,12 +0,0 @@
colossalai.nn.data.sampler
==========================

.. automodule:: colossalai.nn.data.sampler
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.data.sampler.base_sampler
    colossalai.nn.data.sampler.data_parallel_sampler
@@ -0,0 +1,5 @@
colossalai.nn.layer.non\_parallel\_layers
==========================================

.. automodule:: colossalai.nn.layer.non_parallel_layers
    :members:

@@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_1d
================================

.. automodule:: colossalai.nn.layer.parallel_1d
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_1d.layers


.. automodule:: colossalai.nn.layer.parallel_1d
    :members:

@@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2d
================================

.. automodule:: colossalai.nn.layer.parallel_2d
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_2d.layers


.. automodule:: colossalai.nn.layer.parallel_2d
    :members:

@@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_2p5d
==================================

.. automodule:: colossalai.nn.layer.parallel_2p5d
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_2p5d.layers


.. automodule:: colossalai.nn.layer.parallel_2p5d
    :members:

@@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_3d
================================

.. automodule:: colossalai.nn.layer.parallel_3d
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_3d.layers


.. automodule:: colossalai.nn.layer.parallel_3d
    :members:

@@ -1,11 +1,11 @@
colossalai.nn.layer.parallel\_sequence
======================================

.. automodule:: colossalai.nn.layer.parallel_sequence
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_sequence.layers


.. automodule:: colossalai.nn.layer.parallel_sequence
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer.layers
========================================================

.. automodule:: colossalai.nn.layer.parallel_vision_transformer.layers
    :members:

@@ -1,11 +0,0 @@
colossalai.nn.layer.parallel\_vision\_transformer
=================================================

.. automodule:: colossalai.nn.layer.parallel_vision_transformer
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.parallel_vision_transformer.layers
@@ -1,9 +1,6 @@
colossalai.nn.layer
===================

.. automodule:: colossalai.nn.layer
    :members:

.. toctree::
    :maxdepth: 2

@@ -12,13 +9,10 @@ colossalai.nn.layer
    colossalai.nn.layer.parallel_2p5d
    colossalai.nn.layer.parallel_3d
    colossalai.nn.layer.parallel_sequence
    colossalai.nn.layer.parallel_vision_transformer
    colossalai.nn.layer.vanilla_resnet
    colossalai.nn.layer.vanilla_vision_transformer
    colossalai.nn.layer.non_parallel_layers
    colossalai.nn.layer.wrapper


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.base_layer


.. automodule:: colossalai.nn.layer
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.basic\_block
================================================

.. automodule:: colossalai.nn.layer.vanilla_resnet.basic_block
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.bottleneck
==============================================

.. automodule:: colossalai.nn.layer.vanilla_resnet.bottleneck
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.conv
========================================

.. automodule:: colossalai.nn.layer.vanilla_resnet.conv
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_resnet.reslayer
============================================

.. automodule:: colossalai.nn.layer.vanilla_resnet.reslayer
    :members:

@@ -1,14 +0,0 @@
colossalai.nn.layer.vanilla\_resnet
===================================

.. automodule:: colossalai.nn.layer.vanilla_resnet
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.vanilla_resnet.basic_block
    colossalai.nn.layer.vanilla_resnet.bottleneck
    colossalai.nn.layer.vanilla_resnet.conv
    colossalai.nn.layer.vanilla_resnet.reslayer

@@ -1,5 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer.layers
=======================================================

.. automodule:: colossalai.nn.layer.vanilla_vision_transformer.layers
    :members:

@@ -1,11 +0,0 @@
colossalai.nn.layer.vanilla\_vision\_transformer
================================================

.. automodule:: colossalai.nn.layer.vanilla_vision_transformer
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.layer.vanilla_vision_transformer.layers
@@ -1,5 +0,0 @@
colossalai.nn.loss.base\_loss
=============================

.. automodule:: colossalai.nn.loss.base_loss
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.loss.cross\_entropy\_1d
=====================================

.. automodule:: colossalai.nn.loss.cross_entropy_1d
    :members:

@@ -1,15 +1,13 @@
colossalai.nn.loss
==================

.. automodule:: colossalai.nn.loss
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.loss.base_loss
    colossalai.nn.loss.cross_entropy_1d
    colossalai.nn.loss.cross_entropy_2d
    colossalai.nn.loss.cross_entropy_2p5d
    colossalai.nn.loss.cross_entropy_3d


.. automodule:: colossalai.nn.loss
    :members:

@@ -1,10 +1,6 @@
colossalai.nn.lr\_scheduler
===========================

.. automodule:: colossalai.nn.lr_scheduler
    :members:


.. toctree::
    :maxdepth: 2

@@ -15,3 +11,7 @@ colossalai.nn.lr\_scheduler
    colossalai.nn.lr_scheduler.onecycle
    colossalai.nn.lr_scheduler.poly
    colossalai.nn.lr_scheduler.torch


.. automodule:: colossalai.nn.lr_scheduler
    :members:
@@ -1,5 +0,0 @@
colossalai.nn.model.base\_model
===============================

.. automodule:: colossalai.nn.model.base_model
    :members:

@@ -0,0 +1,5 @@
colossalai.nn.model.model\_from\_config
========================================

.. automodule:: colossalai.nn.model.model_from_config
    :members:

@@ -1,17 +1,7 @@
colossalai.nn.model
===================

.. automodule:: colossalai.nn.model
    :members:

.. toctree::
    :maxdepth: 2

    colossalai.nn.model.vanilla_resnet
    colossalai.nn.model.vision_transformer


.. toctree::
    :maxdepth: 2

    colossalai.nn.model.base_model
    colossalai.nn.model.model_from_config

@@ -1,5 +0,0 @@
colossalai.nn.model.vanilla\_resnet.resnet
==========================================

.. automodule:: colossalai.nn.model.vanilla_resnet.resnet
    :members:

@@ -1,11 +0,0 @@
colossalai.nn.model.vanilla\_resnet
===================================

.. automodule:: colossalai.nn.model.vanilla_resnet
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.model.vanilla_resnet.resnet

@@ -1,11 +0,0 @@
colossalai.nn.model.vision\_transformer
=======================================

.. automodule:: colossalai.nn.model.vision_transformer
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.model.vision_transformer.vision_transformer

@@ -1,5 +0,0 @@
colossalai.nn.model.vision\_transformer.vision\_transformer
===========================================================

.. automodule:: colossalai.nn.model.vision_transformer.vision_transformer
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.multi\_tensor\_apply.multi\_tensor\_apply
=======================================================

.. automodule:: colossalai.nn.multi_tensor_apply.multi_tensor_apply
    :members:

@@ -1,11 +0,0 @@
colossalai.nn.multi\_tensor\_apply
==================================

.. automodule:: colossalai.nn.multi_tensor_apply
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.multi_tensor_apply.multi_tensor_apply
@@ -1,5 +0,0 @@
colossalai.nn.optimizer.fp16\_optimizer
=======================================

.. automodule:: colossalai.nn.optimizer.fp16_optimizer
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.optimizer.loss\_scaler
====================================

.. automodule:: colossalai.nn.optimizer.loss_scaler
    :members:

@@ -1,20 +1,15 @@
colossalai.nn.optimizer
=======================

.. automodule:: colossalai.nn.optimizer
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.nn.optimizer.fp16_optimizer
    colossalai.nn.optimizer.fused_adam
    colossalai.nn.optimizer.fused_lamb
    colossalai.nn.optimizer.fused_sgd
    colossalai.nn.optimizer.lamb
    colossalai.nn.optimizer.lars
    colossalai.nn.optimizer.loss_scaler
    colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
    colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
    colossalai.nn.optimizer.zero_redundancy_optimizer_level_3


.. automodule:: colossalai.nn.optimizer
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_1
=============================================================

.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_1
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_2
=============================================================

.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_2
    :members:

@@ -1,5 +0,0 @@
colossalai.nn.optimizer.zero\_redundancy\_optimizer\_level\_3
=============================================================

.. automodule:: colossalai.nn.optimizer.zero_redundancy_optimizer_level_3
    :members:
@@ -1,16 +1,15 @@
colossalai.nn
=============

.. automodule:: colossalai.nn
    :members:

.. toctree::
    :maxdepth: 2

    colossalai.nn.data
    colossalai.nn.layer
    colossalai.nn.loss
    colossalai.nn.lr_scheduler
    colossalai.nn.model
    colossalai.nn.multi_tensor_apply
    colossalai.nn.optimizer


.. automodule:: colossalai.nn
    :members:

@@ -1,11 +1,11 @@
colossalai.registry
===================

.. automodule:: colossalai.registry
    :members:


.. toctree::
    :maxdepth: 2

    colossalai.registry.registry


.. automodule:: colossalai.registry
    :members:

@@ -1,12 +1,18 @@
colossalai
==========

.. automodule:: colossalai
    :members:

.. toctree::
    :maxdepth: 2

    colossalai.constants
    colossalai.core
    colossalai.initialize


.. toctree::
    :maxdepth: 2

    colossalai.amp
    colossalai.builder
    colossalai.communication
    colossalai.context

@@ -16,11 +22,7 @@ colossalai
    colossalai.registry
    colossalai.trainer
    colossalai.utils
    colossalai.zero


.. toctree::
    :maxdepth: 2

    colossalai.constants
    colossalai.core
    colossalai.initialize

.. automodule:: colossalai
    :members:

@@ -1,9 +1,6 @@
colossalai.trainer
==================

.. automodule:: colossalai.trainer
    :members:

.. toctree::
    :maxdepth: 2

@@ -14,3 +11,7 @@ colossalai.trainer
    :maxdepth: 2

    colossalai.trainer.metric


.. automodule:: colossalai.trainer
    :members:
@@ -0,0 +1,5 @@
colossalai.utils.data\_sampler
=======================================

.. automodule:: colossalai.utils.data_sampler
    :members:

@@ -0,0 +1,5 @@
colossalai.utils.gradient\_accumulation
=======================================

.. automodule:: colossalai.utils.gradient_accumulation
    :members:

@@ -0,0 +1,8 @@
colossalai.utils.multi\_tensor\_apply
======================================

.. automodule:: colossalai.utils.multi_tensor_apply.multi_tensor_apply
    :members:

@@ -1,10 +1,6 @@
colossalai.utils
================

.. automodule:: colossalai.utils
    :members:


.. toctree::
    :maxdepth: 2

@@ -12,5 +8,12 @@ colossalai.utils
    colossalai.utils.checkpointing
    colossalai.utils.common
    colossalai.utils.cuda
    colossalai.utils.data_sampler
    colossalai.utils.gradient_accumulation
    colossalai.utils.memory
    colossalai.utils.multi_tensor_apply
    colossalai.utils.timer


.. automodule:: colossalai.utils
    :members:

@@ -0,0 +1,5 @@
colossalai.zero
================

.. automodule:: colossalai.zero
    :members: