ColossalAI/colossalai/zero/__init__.py

import torch
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.amp.naive_amp import NaiveAMPModel
from colossalai.utils import is_no_pp_or_last_stage
from colossalai.core import global_context as gpc
from colossalai.context.parallel_mode import ParallelMode

from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2
from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3


def convert_to_zero(model: nn.Module,
                    optimizer: Optimizer,
                    level: int,
                    zero_config):
    """Wrap a model/optimizer pair for ZeRO level 2 or level 3.

    Args:
        model: The module to be trained.
        optimizer: The optimizer handed to the ZeRO optimizer wrapper as
            ``init_optimizer``.
        level: ZeRO level; must be ``2`` or ``3``.
        zero_config: Mapping that is unpacked as keyword arguments into the
            ZeRO optimizer constructor.

    Returns:
        A ``(model, optimizer)`` tuple. For level 2 the model is wrapped in
        ``NaiveAMPModel`` and the optimizer is a
        ``ZeroRedundancyOptimizer_Level_2``. For level 3 the model is returned
        as passed in and the optimizer is a ``ZeroRedundancyOptimizer_Level_3``
        that also receives the model as ``module``.

    Raises:
        AssertionError: If ``level`` is neither 2 nor 3.
    """
    assert level in (2, 3), 'Only ZERO Optimizer Level 2 and 3 are provided'

    if level == 2:
        # ``output_to_fp32`` follows the result of ``is_no_pp_or_last_stage()``.
        to_fp32 = bool(is_no_pp_or_last_stage())
        amp_model = NaiveAMPModel(model, output_to_fp32=to_fp32)
        zero_optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config)
        return amp_model, zero_optimizer

    # Level 3: the model is not wrapped here, only handed to the optimizer.
    zero_optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=model, **zero_config)
    return model, zero_optimizer


def zero3_model_context(dtype=torch.half):
    """Return a context for constructing very large models for ZeRO-3 training.

    Models built inside the context are automatically partitioned (sharded)
    across the system and created with the requested ``dtype``. The ZeRO-3
    settings are read from ``gpc.config.zero``: the optional attributes
    ``offload_param_config``, ``offload_optimizer_config`` and ``aio_config``
    are forwarded to DeepSpeed, and each falls back to ``None`` when absent.

    Args:
        dtype (``dtype``, optional): Data type of the parameters.
            Supported options are ``torch.half`` and ``torch.float``.
            Defaults to ``torch.half``.

    Returns:
        The object returned by ``deepspeed.zero.Init``, meant to be used in a
        ``with`` statement (see the example below).

    Raises:
        AssertionError: If ``dtype`` is neither ``torch.half`` nor
            ``torch.float``.

    This context accelerates model initialization and enables models that
    are too large to allocate in their entirety in CPU memory. It has the
    following effects:

    #. allocates tensors to either GPU or CPU memory or NVMe
    #. converts floating point tensors to half precision
    #. immediately partitions tensors among the group of data-parallel devices
    #. (*optional*) replaces ``torch.nn.functional.linear`` with a more
       memory-efficient implementation

    These modifications allow for models that exceed the size of local CPU/GPU
    memory/NVMe, but fit within the total capacity (*i.e.*, aggregate CPU
    or GPU memory or NVMe) across all nodes. Consider initializing a model with one
    trillion parameters, whose weights occupy two terabytes (TB) in half
    precision. The initial CPU allocation in full precision requires 4TB of
    memory *per process*, and so a system with 8 GPUs per node would need 32TB of
    CPU memory due to data-parallel redundancies. Instead, by immediately
    partitioning tensors we remove the redundancies. The result is that
    regardless of the number of GPUs, we still only require the original 4TB. This
    allows for a linear increase in model size with the aggregate system memory.
    For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
    parameter model with 4 nodes and 32 GPUs.

    Important: If the fp16 weights of the model can't fit onto a single GPU memory
    this feature must be used.

    Examples
    --------

    #. Allocate a model and partition it among all processes:

        .. code-block:: python

            with zero3_model_context():
                model = MyLargeModel()

    """
    assert dtype == torch.half or dtype == torch.float, f'Invalid dtype, except torch.half or torch.float, got {dtype}'
    # Imported inside the function, so importing this module does not itself
    # import deepspeed.
    import deepspeed

    zero_cfg = gpc.config.zero
    offload_param = getattr(zero_cfg, 'offload_param_config', None)
    ds_config = {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {
            "offload_param": offload_param,
            # Default to None like the sibling lookups, so a config without an
            # offload-optimizer section does not raise AttributeError.
            "offload_optimizer": getattr(zero_cfg, 'offload_optimizer_config', None),
        },
        "aio": getattr(zero_cfg, 'aio_config', None)
    }
    # When no parameter offload is configured these fall back to None / False.
    remote_device = getattr(offload_param, 'device', None)
    pin_memory = getattr(offload_param, 'pin_memory', False)
    return deepspeed.zero.Init(data_parallel_group=gpc.get_group(ParallelMode.DATA),
                               remote_device=remote_device,
                               config_dict_or_path=ds_config,
                               pin_memory=pin_memory,
                               dtype=dtype)


# Names exported by this package.
__all__ = [
    'convert_to_zero',
    'ZeroRedundancyOptimizer_Level_2',
    'ZeroRedundancyOptimizer_Level_3',
    'zero3_model_context',
]
fix zero3 fp16 and add zero3 model context (#62) 3 years ago			`import torch`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago			`import torch.nn as nn`
			`from torch.optim import Optimizer`
			`from colossalai.amp.naive_amp import NaiveAMPModel`
			`from colossalai.utils import is_no_pp_or_last_stage`
fix zero3 fp16 and add zero3 model context (#62) 3 years ago			`from colossalai.core import global_context as gpc`
			`from colossalai.context.parallel_mode import ParallelMode`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago
			`from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2`
			`from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3`


			`def convert_to_zero(model: nn.Module,`
			`optimizer: Optimizer,`
			`level: int,`
			`zero_config):`
			`assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided'`
fix zero3 fp16 and add zero3 model context (#62) 3 years ago			`if level == 2:`
			`if is_no_pp_or_last_stage():`
			`model = NaiveAMPModel(model, output_to_fp32=True)`
			`else:`
			`model = NaiveAMPModel(model, output_to_fp32=False)`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 3 years ago
			`if level == 2:`
			`optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config)`
			`else:`
			`optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=model, **zero_config)`
			`return model, optimizer`


fix zero3 fp16 and add zero3 model context (#62) 3 years ago			`def zero3_model_context(dtype=torch.half):`
			`"""A context to enable massive model construction for training with`
			`ZeRO-3. Models are automatically partitioned (or, sharded) across the`
			system and converted to half precision. Note that the config of ZeRO-3 will be loaded automatically from `gpc.config`.

			`Args:`
			dtype (``dtype``, optional): Can be used to change the data type of the parameters.
			Supported options are ``torch.half`` and ``torch.float``. Defaults to ``torch.half``

			`This context accelerates model initialization and enables models that`
			`are too large to allocate in their entirety in CPU memory. It has the`
			`following effects:`

			`#. allocates tensors to either GPU or CPU memory or NVMe`
			`#. converts floating point tensors to half precision`
			`#. immediately partitions tensors among the group of data-parallel devices`
			#. (optional) replaces ``torch.nn.functional.linear`` with a more
			`memory-efficient implementation`

			`These modifications allow for models that exceed the size of local CPU/GPU`
			`memory/NVMe, but fit within the total NVMe capacity (i.e., aggregate CPU`
			`or GPU memory or NVMe) across all nodes. Consider initializing a model with one`
			`trillion parameters, whose weights occupy two terabytes (TB) in half`
			`precision. The initial CPU allocation in full precision requires 4TB of`
			`memory per process, and so a system with 8 GPUs per node would need 32TB of`
			`CPU memory due to data-parallel redundancies. Instead, by immediately`
			`partitioning tensors we remove the redundancies. The result is that`
			`regardless of the number of GPUs, we still only require the original 4TB. This`
			`allows for a linear increase in model size with the aggregate system memory.`
			`For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion`
			`parameter model with 4 nodes and 32 GPUs.`

			`Important: If the fp16 weights of the model can't fit onto a single GPU memory`
			`this feature must be used.`

			`Examples`
			`--------`

			`#. Allocate a model and partition it among all processes:`

			`.. code-block:: python`

			`with zero3_model_context():`
			`model = MyLargeModel()`

			`"""`
			`assert dtype == torch.half or dtype == torch.float, f'Invalid dtype, except torch.half or torch.float, got {dtype}'`
			`import deepspeed`
			`ds_config = {`
			`"train_micro_batch_size_per_gpu": 1,`
			`"gradient_accumulation_steps": 1,`
			`"zero_optimization": {`
			`"offload_param": getattr(gpc.config.zero, 'offload_param_config', None),`
			`"offload_optimizer": getattr(gpc.config.zero, 'offload_optimizer_config'),`
			`},`
			`"aio": getattr(gpc.config.zero, 'aio_config', None)`
			`}`
			`remote_device = getattr(ds_config['zero_optimization']['offload_param'], 'device', None)`
			`pin_memory = getattr(ds_config['zero_optimization']['offload_param'], 'pin_memory', False)`
			`return deepspeed.zero.Init(data_parallel_group=gpc.get_group(ParallelMode.DATA),`
			`remote_device=remote_device,`
			`config_dict_or_path=ds_config,`
			`pin_memory=pin_memory,`
			`dtype=dtype)`


			`__all__ = ['convert_to_zero', 'ZeroRedundancyOptimizer_Level_2',`
			`'ZeroRedundancyOptimizer_Level_3', 'zero3_model_context']`