mirror of https://github.com/InternLM/InternLM
191 lines
7.0 KiB
Python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine

from typing import List, Optional

import torch
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from torch.optim.lr_scheduler import _LRScheduler

from internlm.core.gradient_handler import BaseGradientHandler
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.optimizer.hybrid_zero_optim import BaseOptimizer
from internlm.utils.common import get_batch_size, move_to_device

class Engine:
    """
    The Engine class is responsible for managing the training and evaluation process of a neural network model.
    It handles the forward and backward passes, parameter updates, gradient handling, and mode switching between
    training and evaluation.

    Args:
        model (torch.nn.Module): The neural network model to be trained or evaluated.
        optimizer (BaseOptimizer): The optimizer used for updating the parameters of the model.
        lr_scheduler (torch.optim.lr_scheduler._LRScheduler, optional): The learning rate scheduler for the optimizer.
            Default is None.
        beta2_scheduler (internlm.solver.beta2_scheduler.Beta2Scheduler, optional): The beta2 scheduler for the
            optimizer. Default is None.
        criterion (torch.nn.modules.loss._Loss, optional): The loss function used for calculating the loss during
            training. Default is None.
        gradient_handlers (List[BaseGradientHandler], optional): A list of gradient handlers used in the backward pass.
            Default is None.
        clip_grad_norm (float, optional): The norm value for gradient clipping. Default is 0.0.

    Examples:
        >>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
        >>> model = ...
        >>> criterion = ...
        >>> optimizer = ...
        >>> train_dataloader = ...
        >>> engine, _, _, _ = internlm.initialize_engine(model, optimizer, criterion)
        >>> engine.train()
        >>> for inputs, labels in train_dataloader:
        >>>     # set gradients to zero
        >>>     engine.zero_grad()
        >>>     # run forward pass
        >>>     outputs = engine(inputs)
        >>>     # compute loss value and run backward pass
        >>>     loss = engine.criterion(outputs, labels)
        >>>     engine.backward(loss)
        >>>     # update parameters
        >>>     engine.step()
    """

    def __init__(
        self,
        model: Module,
        optimizer: BaseOptimizer,
        lr_scheduler: Optional[_LRScheduler] = None,
        beta2_scheduler: Optional[Beta2Scheduler] = None,
        criterion: Optional[_Loss] = None,
        gradient_handlers: Optional[List[BaseGradientHandler]] = None,
        clip_grad_norm: float = 0.0,
    ):
        self._model = model
        self._optimizer = optimizer
        self._lr_scheduler = lr_scheduler
        self._beta2_scheduler = beta2_scheduler
        self._criterion = criterion
        self._clip_grad_norm = clip_grad_norm

        # state
        self.training = True  # default

        # build gradient handler
        self._gradient_handlers = gradient_handlers if gradient_handlers else []

    @property
    def model(self):
        """Returns the model attached to the engine."""
        return self._model

    @property
    def optimizer(self):
        """Returns the optimizer attached to the engine."""
        return self._optimizer

    @property
    def criterion(self):
        """Returns the criterion (loss function) attached to the engine."""
        return self._criterion

    def _all_reduce_gradients(self):
        """Handles all-reduce operations of gradients across different parallel groups."""
        for handler in self._gradient_handlers:
            handler.handle_gradient()

    def zero_grad(self):
        """Sets the gradient of all parameters in the model to zero."""
        self.optimizer.zero_grad()

    def step(self):
        """
        Executes the parameter update step. This includes all-reduce operations of gradients, gradient clipping,
        and parameter update. If successful, it also steps the learning rate scheduler and beta2 scheduler
        if they exist.

        Returns:
            success (bool): Whether the parameter update was successful.
            grad_norm (float): The norm of the gradient after clipping.
        """
        self._all_reduce_gradients()
        self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)

        success, grad_norm = self.optimizer.step()

        if success and self._lr_scheduler is not None:
            self._lr_scheduler.step()

        if success and self._beta2_scheduler is not None:
            self._beta2_scheduler.step()

        return success, grad_norm

    def train(self):
        """Sets the model to training mode."""
        self.training = True
        self._model.train()

    def eval(self):
        """Sets the model to evaluation mode."""
        self.training = False
        self._model.eval()

    def backward(self, loss: torch.Tensor):
        """
        Starts the backward propagation given the loss value computed by a loss function.

        Args:
            loss (torch.Tensor): The loss value computed by a loss function.
        """
        return self.optimizer.backward(loss)

    def backward_by_grad(self, tensor, grad):
        """
        Starts the backward propagation given the gradient of the output tensor.

        Args:
            tensor (torch.Tensor): The output tensor.
            grad (torch.Tensor): The gradient passed back to the output tensor.
        """
        return self.optimizer.backward_by_grad(tensor, grad)

    def __call__(self, *args, **kwargs):
        """
        Runs the forward step for the model.

        Returns:
            torch.Tensor: The output of the model.
        """
        return self.model(*args, **kwargs)

    def load_batch(self, data_iter, to_gpu=True):
        """
        Loads a batch from the data iterator and, by default, moves it to the same device as the model.

        Args:
            data_iter (Iterable): The data iterator from which to get a batch of data, obtained by calling
                iter(dataloader).
            to_gpu (bool, optional): Whether the data should be moved to the GPU. Default is True.

        Returns:
            Tuple (Any, int): A tuple of (batch_data, batch_size).
        """
        if data_iter is None:
            raise RuntimeError("Dataloader is not defined.")
        try:
            batch_data = next(data_iter)
        except TypeError:
            # data_iter is not an iterator (e.g. an already materialized batch), so use it directly
            batch_data = data_iter

        if to_gpu:
            batch_data = move_to_device(batch_data)
        batch_size = get_batch_size(batch_data)

        return batch_data, batch_size
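

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): it shows how the
# Engine drives one training step (zero_grad -> forward -> loss -> backward ->
# step). In real InternLM training the optimizer is a BaseOptimizer subclass
# (e.g. the hybrid ZeRO optimizer) and the engine is usually built via
# `internlm.initialize_engine`, as shown in the class docstring; `_ToyOptimizer`
# below is a hypothetical stand-in that only mimics the small interface the
# Engine relies on (zero_grad, backward, backward_by_grad, clip_grad_norm, step).
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _ToyOptimizer:
        """Minimal stand-in exposing the optimizer interface used by Engine."""

        def __init__(self, torch_optim: torch.optim.Optimizer):
            self._optim = torch_optim
            self._last_grad_norm = 0.0

        def zero_grad(self):
            self._optim.zero_grad()

        def backward(self, loss: torch.Tensor):
            loss.backward()

        def backward_by_grad(self, tensor: torch.Tensor, grad: torch.Tensor):
            torch.autograd.backward(tensor, grad)

        def clip_grad_norm(self, model: Module, max_norm: float):
            if max_norm > 0.0:
                self._last_grad_norm = float(
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                )

        def step(self):
            self._optim.step()
            # Engine.step expects a (success, grad_norm) pair back
            return True, self._last_grad_norm

    toy_model = torch.nn.Linear(8, 2)
    toy_engine = Engine(
        model=toy_model,
        optimizer=_ToyOptimizer(torch.optim.SGD(toy_model.parameters(), lr=0.1)),
        criterion=torch.nn.CrossEntropyLoss(),
        clip_grad_norm=1.0,
    )

    inputs, labels = torch.randn(4, 8), torch.randint(0, 2, (4,))
    toy_engine.train()
    toy_engine.zero_grad()
    outputs = toy_engine(inputs)
    loss = toy_engine.criterion(outputs, labels)
    toy_engine.backward(loss)
    success, grad_norm = toy_engine.step()
    print(f"success={success}, grad_norm={grad_norm:.4f}")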