ColossalAI/colossalai/engine/schedule/_base_schedule.py


#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from abc import ABC, abstractmethod

import torch

from colossalai.core import global_context as gpc
from colossalai.logging import get_global_dist_logger
from colossalai.utils import get_current_device
class BaseSchedule(ABC):
    """A basic helper class to control the process of training or evaluation.
    It mainly consists of ``forward_backward_step``, which performs the forward
    pass and the gradient backward pass, and ``optimizer_step``, which updates
    the parameters. For convenience in enabling FP16, all code that controls
    FP16 is aggregated in the schedule classes.
    """
    def __init__(self):
        self.logger = get_global_dist_logger()

    @staticmethod
    def _move_tensor(element):
        if torch.is_tensor(element):
            if not element.is_cuda:
                return element.to(get_current_device()).detach()
        return element

    def _move_to_device(self, data):
        if isinstance(data, (tuple, list)):
            data = tuple([self._move_tensor(d) for d in data])
        elif torch.is_tensor(data):
            data = data.to(get_current_device()).detach()
        return data
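    # A quick illustration of the helper above (the example tensors are
    # hypothetical): _move_to_device accepts either a single tensor or a
    # tuple/list of tensors, moving and detaching each one, e.g.
    #     batch = (torch.randn(8, 3), torch.randint(0, 10, (8,)))
    #     batch = self._move_to_device(batch)  # both tensors on the GPU, detached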
    def load_batch(self, data_iter):
        """Loads a batch from the data iterator. The returned data and labels
        are already on the same device as the model.

        :return: (data, label)
        :rtype: (Tensor, Tensor)
        """
        if data_iter is None:
            raise RuntimeError('Dataloader is not defined.')
        data, label = next(data_iter)
        return self._move_to_device(data), self._move_to_device(label)
    def initialize(self, model, optimizer):
        """Initializes the model and the optimizer before training.
        This is often used in FP16 training.

        :param model: The neural network model
        :param optimizer: Optimizer for updating the parameters
        """
        return model, optimizer
    @abstractmethod
    def forward_backward_step(self,
                              data_iter,
                              model,
                              criterion,
                              optimizer=None,
                              forward_only=False,
                              grad_accum_size: int = 1,
                              return_loss=True):
        """The process function over a batch of the dataset for training or evaluation.

        :param data_iter: Data iterator of the dataset
        :param model: Model used in training or evaluation
        :param criterion: Loss function
        :param optimizer: Optimizer used in training or evaluation
        :param forward_only: If True, the process won't include the backward pass
        :param grad_accum_size: Number of gradient accumulation steps
        :param return_loss: If False, the loss won't be returned
        """
        pass
    @abstractmethod
    def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
        """Updates the parameters with the optimizer.

        :param model: The neural network model
        :param optimizer: Optimizer for updating the parameters
        :param grad_clipping: The maximum norm for gradient clipping
        :type grad_clipping: float, optional
        """
        pass
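# A minimal sketch of a concrete schedule built on BaseSchedule, assuming
# plain single-stage training (no pipeline parallelism). The class name
# SimpleSchedule and its loop structure are illustrative assumptions, not
# ColossalAI's actual schedule implementation.
class SimpleSchedule(BaseSchedule):

    def forward_backward_step(self,
                              data_iter,
                              model,
                              criterion,
                              optimizer=None,
                              forward_only=False,
                              grad_accum_size: int = 1,
                              return_loss=True):
        accum_loss = None
        for _ in range(grad_accum_size):
            # load_batch already moves data and label to the current device
            data, label = self.load_batch(data_iter)
            output = model(data)
            # Scale the loss so the accumulated gradient matches a full batch
            loss = criterion(output, label) / grad_accum_size
            if not forward_only:
                loss.backward()
            detached = loss.detach()
            accum_loss = detached if accum_loss is None else accum_loss + detached
        return output, label, accum_loss if return_loss else None

    def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
        # Clip by global norm only when a positive threshold is given
        if grad_clipping > 0.0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clipping)
        optimizer.step()
        optimizer.zero_grad()


# A typical driver loop over one batch (model, optimizer, criterion and
# train_loader are assumed to exist already):
#     schedule = SimpleSchedule()
#     model, optimizer = schedule.initialize(model, optimizer)
#     data_iter = iter(train_loader)
#     output, label, loss = schedule.forward_backward_step(
#         data_iter, model, criterion, optimizer)
#     schedule.optimizer_step(model, optimizer, grad_clipping=1.0)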