ColossalAI/colossalai/booster/booster.py

import warnings
from contextlib import contextmanager
from typing import Callable, ContextManager, Iterator, List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import DataLoader

from colossalai.checkpoint_io import GeneralCheckpointIO

from .accelerator import Accelerator
from .mixed_precision import MixedPrecision, mixed_precision_factory
from .plugin import Plugin

__all__ = ['Booster']


class Booster:
    """
    Booster is a high-level API for training neural networks. It provides a unified interface for
    training with different precision, accelerator, and plugin.

    Examples:
        >>> colossalai.launch(...)
        >>> plugin = GeminiPlugin(stage=3, ...)
        >>> booster = Booster(mixed_precision='fp16', plugin=plugin)
        >>>
        >>> model = GPT2()
        >>> optimizer = Adam(model.parameters())
        >>> dataloader = DataLoader(dataset)
        >>> lr_scheduler = LinearWarmupScheduler()
        >>> criterion = GPTLMLoss()
        >>>
        >>> model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(
        >>>     model, optimizer, criterion, dataloader, lr_scheduler)
        >>>
        >>> for epoch in range(max_epochs):
        >>>     for input_ids, attention_mask in dataloader:
        >>>         outputs = model(input_ids, attention_mask)
        >>>         loss = criterion(outputs.logits, input_ids)
        >>>         booster.backward(loss, optimizer)
        >>>         optimizer.step()
        >>>         lr_scheduler.step()
        >>>         optimizer.zero_grad()


    Args:
        device (str or torch.device): The device to run the training. Default: 'cuda'.
        mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.
                                If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.
                                'fp16' would use PyTorch AMP while `fp16_apex` would use Nvidia Apex.
        plugin (Plugin): The plugin to run the training. Default: None.

    Attributes:
        plugin: The plugin passed to the constructor, or None.
        accelerator: An ``Accelerator`` built from ``device``, or None when the plugin controls the device.
        mixed_precision: The resolved ``MixedPrecision`` object, or None when no mixed precision was
            requested or the plugin controls the precision.
        checkpoint_io: The checkpoint IO supplied by the plugin when it controls checkpoint IO,
            otherwise a ``GeneralCheckpointIO``.
    """

    def __init__(self,
                 device: str = 'cuda',
                 mixed_precision: Optional[Union[MixedPrecision, str]] = None,
                 plugin: Optional[Plugin] = None) -> None:
        if plugin is not None:
            assert isinstance(
                plugin, Plugin), f'Expected the argument plugin to be an instance of Plugin, but got {type(plugin)}.'
        self.plugin = plugin

        # set accelerator: a plugin that controls the device takes precedence over ``device``
        if self.plugin is not None and self.plugin.control_device():
            self.accelerator = None
            warnings.warn('The plugin will control the accelerator, so the device argument will be ignored.')
        else:
            self.accelerator = Accelerator(device)

        # set precision: a plugin that controls the precision takes precedence over ``mixed_precision``
        if self.plugin is not None and self.plugin.control_precision():
            warnings.warn('The plugin will control the precision, so the mixed_precision argument will be ignored.')
            self.mixed_precision = None
        elif mixed_precision is None:
            self.mixed_precision = None
        elif isinstance(mixed_precision, str):
            # the user will take the default arguments for amp training
            self.mixed_precision = mixed_precision_factory(mixed_precision)
        elif isinstance(mixed_precision, MixedPrecision):
            # the user can customize the arguments by passing the precision object
            self.mixed_precision = mixed_precision
        else:
            raise ValueError(
                f'Expected the argument mixed_precision to be a string or an instance of MixedPrecision, but got {type(mixed_precision)}.'
            )

        # set checkpoint io: the plugin may ship its own implementation
        if self.plugin is not None and self.plugin.control_checkpoint_io():
            self.checkpoint_io = self.plugin.get_checkpoint_io()
        else:
            self.checkpoint_io = GeneralCheckpointIO()

    def boost(
        self,
        model: nn.Module,
        optimizer: Optimizer,
        criterion: Optional[Callable] = None,
        dataloader: Optional[DataLoader] = None,
        lr_scheduler: Optional[LRScheduler] = None,
    ) -> Tuple[nn.Module, Optimizer, Optional[Callable], Optional[DataLoader], Optional[LRScheduler]]:
        """
        Boost the model, optimizer, criterion, dataloader, and lr_scheduler.

        The plugin (if any) configures all five objects first, then the accelerator configures the
        model, and finally the mixed precision object configures the model, optimizer and criterion.

        Args:
            model (nn.Module): The model to be boosted.
            optimizer (Optimizer): The optimizer to be boosted.
            criterion (Callable): The criterion to be boosted.
            dataloader (DataLoader): The dataloader to be boosted.
            lr_scheduler (LRScheduler): The lr_scheduler to be boosted.

        Returns:
            A tuple ``(model, optimizer, criterion, dataloader, lr_scheduler)`` in the same order as
            the arguments. Objects that no step touched are returned as passed in.
        """
        # TODO(FrankLeeeee): consider multi-model and multi-optimizer case
        # TODO(FrankLeeeee): consider multi-dataloader case
        # let the plugin transform every training component
        if self.plugin is not None:
            model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(
                model, optimizer, criterion, dataloader, lr_scheduler)

        # NOTE(review): the accelerator is only applied when a plugin is given; with no plugin the
        # accelerator built in __init__ is never used here. Confirm whether that is intended.
        if self.plugin is not None and not self.plugin.control_device():
            # transform model for accelerator
            model = self.accelerator.configure(model)

        if self.mixed_precision and not (self.plugin is not None and self.plugin.control_precision()):
            # transform model for mixed precision
            # when mixed_precision is specified and the plugin is not given or does not control the precision
            model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)

        return model, optimizer, criterion, dataloader, lr_scheduler

    def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
        """
        Run the backward pass for ``loss`` by delegating to ``optimizer.backward``.

        Args:
            loss (torch.Tensor): The loss to back-propagate.
            optimizer (Optimizer): The optimizer; it must expose a ``backward(loss)`` method
                (presumably the wrapped optimizer returned by ``boost`` -- TODO confirm).
        """
        # TODO: implement this method with plugin
        optimizer.backward(loss)

    def execute_pipeline(self,
                         data_iter: Iterator,
                         model: nn.Module,
                         criterion: Callable[[torch.Tensor], torch.Tensor],
                         optimizer: Optimizer,
                         return_loss: bool = True,
                         return_outputs: bool = False) -> Tuple[Optional[torch.Tensor], ...]:
        """
        Placeholder for the pipeline forward/backward pass.

        Not implemented yet: the method currently does nothing and returns None regardless of
        the arguments.
        """
        # TODO: implement this method
        # run pipeline forward backward pass
        # return loss or outputs if needed
        pass

    def no_sync(self, model: nn.Module) -> ContextManager:
        """
        Return the plugin's ``no_sync`` context for ``model``.

        Args:
            model (nn.Module): The model passed through to ``plugin.no_sync``.

        Raises:
            AssertionError: If no plugin was provided or the plugin does not support no_sync.
        """
        assert self.plugin is not None, 'no_sync is only enabled when a plugin is provided and the plugin supports no_sync.'
        # NOTE(review): ``support_no_sync`` is read as an attribute while ``control_device`` and
        # ``control_precision`` are called as methods -- confirm against the Plugin definition.
        assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'
        return self.plugin.no_sync(model)

    def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):
        """
        Load a model checkpoint via ``self.checkpoint_io``.

        Args:
            model (nn.Module): The model to load the checkpoint into.
            checkpoint (str): The checkpoint location, forwarded unchanged.
            strict (bool): Forwarded unchanged to ``checkpoint_io.load_model``. Default: True.
        """
        self.checkpoint_io.load_model(model, checkpoint, strict)

    def save_model(self,
                   model: nn.Module,
                   checkpoint: str,
                   prefix: Optional[str] = None,
                   shard: bool = False,
                   size_per_shard: int = 1024):
        """
        Save a model checkpoint via ``self.checkpoint_io``.

        Args:
            model (nn.Module): The model to save.
            checkpoint (str): The checkpoint location, forwarded unchanged.
            prefix (str): Forwarded unchanged to ``checkpoint_io.save_model``. Default: None.
            shard (bool): Forwarded unchanged to ``checkpoint_io.save_model``. Default: False.
            size_per_shard (int): Forwarded unchanged; its unit is defined by the checkpoint IO.
                Default: 1024.
        """
        self.checkpoint_io.save_model(model, checkpoint, prefix, shard, size_per_shard)

    def load_optimizer(self, optimizer: Optimizer, checkpoint: str):
        """
        Load an optimizer checkpoint via ``self.checkpoint_io``.

        Args:
            optimizer (Optimizer): The optimizer to load the checkpoint into.
            checkpoint (str): The checkpoint location, forwarded unchanged.
        """
        self.checkpoint_io.load_optimizer(optimizer, checkpoint)

    def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):
        """
        Save an optimizer checkpoint via ``self.checkpoint_io``.

        Args:
            optimizer (Optimizer): The optimizer to save.
            checkpoint (str): The checkpoint location, forwarded unchanged.
            shard (bool): Forwarded unchanged to ``checkpoint_io.save_optimizer``. Default: False.
            size_per_shard (int): Forwarded unchanged; its unit is defined by the checkpoint IO.
                Default: 1024.
        """
        self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, size_per_shard)

    def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
        """
        Save a learning-rate scheduler checkpoint via ``self.checkpoint_io``.

        Args:
            lr_scheduler (LRScheduler): The scheduler to save.
            checkpoint (str): The checkpoint location, forwarded unchanged.
        """
        self.checkpoint_io.save_lr_scheduler(lr_scheduler, checkpoint)

    def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):
        """
        Load a learning-rate scheduler checkpoint via ``self.checkpoint_io``.

        Args:
            lr_scheduler (LRScheduler): The scheduler to load the checkpoint into.
            checkpoint (str): The checkpoint location, forwarded unchanged.
        """
        self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`import warnings`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00			`from contextlib import contextmanager`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`from typing import Callable, Iterator, List, Optional, Tuple, Union`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
			`import torch`
			`import torch.nn as nn`
			`from torch.optim import Optimizer`
			`from torch.optim.lr_scheduler import _LRScheduler as LRScheduler`
			`from torch.utils.data import DataLoader`

[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`from colossalai.checkpoint_io import GeneralCheckpointIO`

[booster] added the accelerator implementation (#3159) 2023-03-20 05:59:24 +00:00			`from .accelerator import Accelerator`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`from .mixed_precision import MixedPrecision, mixed_precision_factory`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00			`from .plugin import Plugin`

			`__all__ = ['Booster']`


			`class Booster:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`"""`
			`Booster is a high-level API for training neural networks. It provides a unified interface for`
[doc] Fix typo under colossalai and doc(#3618) * Fixed several spelling errors under colossalai * Fix the spelling error in colossalai and docs directory * Cautious Changed the spelling error under the example folder * Update runtime_preparation_pass.py revert autograft to autograd * Update search_chunk.py utile to until * Update check_installation.py change misteach to mismatch in line 91 * Update 1D_tensor_parallel.md revert to perceptron * Update 2D_tensor_parallel.md revert to perceptron in line 73 * Update 2p5D_tensor_parallel.md revert to perceptron in line 71 * Update 3D_tensor_parallel.md revert to perceptron in line 80 * Update README.md revert to resnet in line 42 * Update reorder_graph.py revert to indice in line 7 * Update p2p.py revert to megatron in line 94 * Update initialize.py revert to torchrun in line 198 * Update routers.py change to detailed in line 63 * Update routers.py change to detailed in line 146 * Update README.md revert random number in line 402 2023-04-26 03:38:43 +00:00			`training with different precision, accelerator, and plugin.`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
			`Examples:`
			`>>> colossalai.launch(...)`
			`>>> plugin = GeminiPlugin(stage=3, ...)`
			`>>> booster = Booster(precision='fp16', plugin=plugin)`
			`>>>`
			`>>> model = GPT2()`
			`>>> optimizer = Adam(model.parameters())`
			`>>> dataloader = Dataloader(Dataset)`
			`>>> lr_scheduler = LinearWarmupScheduler()`
			`>>> criterion = GPTLMLoss()`
			`>>>`
			`>>> model, optimizer, lr_scheduler, dataloader = booster.boost(model, optimizer, lr_scheduler, dataloader)`
			`>>>`
			`>>> for epoch in range(max_epochs):`
			`>>> for input_ids, attention_mask in dataloader:`
			`>>> outputs = model(input_ids, attention_mask)`
			`>>> loss = criterion(outputs.logits, input_ids)`
			`>>> booster.backward(loss, optimizer)`
			`>>> optimizer.step()`
			`>>> lr_scheduler.step()`
			`>>> optimizer.zero_grad()`


			`Args:`
			`device (str or torch.device): The device to run the training. Default: 'cuda'.`
			`mixed_precision (str or MixedPrecision): The mixed precision to run the training. Default: None.`
			`If the argument is a string, it can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'.`
			'fp16' would use PyTorch AMP while `fp16_apex` would use Nvidia Apex.
			`plugin (Plugin): The plugin to run the training. Default: None.`
			`"""`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
			`def __init__(self,`
[booster] added the accelerator implementation (#3159) 2023-03-20 05:59:24 +00:00			`device: str = 'cuda',`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`mixed_precision: Union[MixedPrecision, str] = None,`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00			`plugin: Optional[Plugin] = None) -> None:`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`if plugin is not None:`
			`assert isinstance(`
			`plugin, Plugin), f'Expected the argument plugin to be an instance of Plugin, but got {type(plugin)}.'`
			`self.plugin = plugin`

			`# set accelerator`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if self.plugin and self.plugin.control_device():`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`self.accelerator = None`
			`warnings.warn('The plugin will control the accelerator, so the device argument will be ignored.')`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`else:`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`self.accelerator = Accelerator(device)`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`# set precision`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if self.plugin and self.plugin.control_precision():`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`warnings.warn('The plugin will control the precision, so the mixed_precision argument will be ignored.')`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`self.mixed_precision = None`
			`elif mixed_precision is None:`
			`self.mixed_precision = None`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`else:`
			`# validate and set precision`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if isinstance(mixed_precision, str):`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`# the user will take the default arguments for amp training`
			`self.mixed_precision = mixed_precision_factory(mixed_precision)`
			`elif isinstance(mixed_precision, MixedPrecision):`
			`# the user can customize the arguments by passing the precision object`
			`self.mixed_precision = mixed_precision`
			`else:`
			`raise ValueError(`
			`f'Expected the argument mixed_precision to be a string or an instance of Precision, but got {type(mixed_precision)}.'`
			`)`

[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if self.plugin is not None and self.plugin.control_checkpoint_io():`
			`self.checkpoint_io = self.plugin.get_checkpoint_io()`
			`else:`
			`self.checkpoint_io = GeneralCheckpointIO()`

[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`def boost(`
			`self,`
			`model: nn.Module,`
			`optimizer: Optimizer,`
			`criterion: Callable = None,`
			`dataloader: DataLoader = None,`
			`lr_scheduler: LRScheduler = None,`
			`) -> List[Union[nn.Module, Optimizer, LRScheduler, DataLoader]]:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`"""`
			`Boost the model, optimizer, criterion, lr_scheduler, and dataloader.`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`Args:`
			`model (nn.Module): The model to be boosted.`
			`optimizer (Optimizer): The optimizer to be boosted.`
			`criterion (Callable): The criterion to be boosted.`
			`dataloader (DataLoader): The dataloader to be boosted.`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`lr_scheduler (LRScheduler): The lr_scheduler to be boosted.`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`"""`
			`# TODO(FrankLeeeee): consider multi-model and multi-optimizer case`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`# TODO(FrankLeeeee): consider multi-dataloader case`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`# transform model for mixed precision`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`if self.plugin:`
			`model, optimizer, criterion, dataloader, lr_scheduler = self.plugin.configure(`
			`model, optimizer, criterion, dataloader, lr_scheduler)`

[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if self.plugin and not self.plugin.control_device():`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`# transform model for accelerator`
			`model = self.accelerator.configure(model)`

[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`if self.mixed_precision and (self.plugin is None or self.plugin and not self.plugin.control_precision()):`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`# transform model for mixed precision`
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`# when mixed_precision is specified and the plugin is not given or does not control the precision`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`model, optimizer, criterion = self.mixed_precision.configure(model, optimizer, criterion)`

			`return model, optimizer, criterion, dataloader, lr_scheduler`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
			`def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:`
[booster] implemented mixed precision class (#3151) * [booster] implemented mixed precision class * polish code 2023-03-17 03:00:15 +00:00			`# TODO: implement this method with plugin`
			`optimizer.backward(loss)`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
			`def execute_pipeline(self,`
			`data_iter: Iterator,`
			`model: nn.Module,`
			`criterion: Callable[[torch.Tensor], torch.Tensor],`
			`optimizer: Optimizer,`
			`return_loss: bool = True,`
			`return_outputs: bool = False) -> Tuple[Optional[torch.Tensor], ...]:`
			`# TODO: implement this method`
			`# run pipeline forward backward pass`
			`# return loss or outputs if needed`
			`pass`

			`def no_sync(self, model: nn.Module) -> contextmanager:`
[booster] added the plugin base and torch ddp plugin (#3180) * [booster] added the plugin base and torch ddp plugin * polish code * polish code * polish code 2023-03-21 09:39:30 +00:00			`assert self.plugin is not None, f'no_sync is only enabled when a plugin is provided and the plugin supports no_sync.'`
			`assert self.plugin.support_no_sync, f'The plugin {self.plugin.__class__.__name__} does not support no_sync.'`
			`return self.plugin.no_sync(model)`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`def load_model(self, model: nn.Module, checkpoint: str, strict: bool = True):`
			`self.checkpoint_io.load_model(model, checkpoint, strict)`
[booster] init module structure and definition (#3056) 2023-03-09 03:27:46 +00:00
[booster] implemented the torch ddd + resnet example (#3232) * [booster] implemented the torch ddd + resnet example * polish code 2023-03-27 02:24:14 +00:00			`def save_model(self,`
			`model: nn.Module,`
			`checkpoint: str,`
			`prefix: str = None,`
			`shard: bool = False,`
			`size_per_shard: int = 1024):`
			`self.checkpoint_io.save_model(model, checkpoint, prefix, shard, size_per_shard)`

			`def load_optimizer(self, optimizer: Optimizer, checkpoint: str):`
			`self.checkpoint_io.load_optimizer(optimizer, checkpoint)`

			`def save_optimizer(self, optimizer: Optimizer, checkpoint: str, shard: bool = False, size_per_shard: int = 1024):`
			`self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, size_per_shard)`

			`def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):`
			`self.checkpoint_io.save_lr_scheduler(lr_scheduler, checkpoint)`

			`def load_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str):`
			`self.checkpoint_io.load_lr_scheduler(lr_scheduler, checkpoint)`