#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from abc import ABC, abstractmethod

import torch

from colossalai.core import global_context as gpc
from colossalai.logging import get_global_dist_logger
from colossalai.utils import get_current_device


class BaseSchedule(ABC):
    """A basic helper class to control the process of training or evaluation.

    It mainly consists of forward_backward_step for the forward and backward
    passes and optimizer_step for the parameter update. For convenience when
    enabling FP16, all code controlling FP16 behavior is aggregated in the
    schedule classes.
    """

    def __init__(self):
        self.logger = get_global_dist_logger()

    @staticmethod
    def _move_tensor(element):
        # Move a single tensor to the current device; non-tensors pass through.
        if torch.is_tensor(element):
            if not element.is_cuda:
                return element.to(get_current_device()).detach()
        return element

    def _move_to_device(self, data):
        # Handles a tuple/list of tensors as well as a single tensor.
        if isinstance(data, (tuple, list)):
            data = tuple(self._move_tensor(d) for d in data)
        elif torch.is_tensor(data):
            data = data.to(get_current_device()).detach()
        return data

    def load_batch(self, data_iter):
        """Loads a batch from the data iterator and returns the data and labels,
        already moved to the same device as the model.

        :return: (data, label)
        :rtype: (Tensor, Tensor)
        """
        if data_iter is None:
            raise RuntimeError('Dataloader is not defined.')
        data, label = next(data_iter)
        return self._move_to_device(data), self._move_to_device(label)
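
    # Hypothetical usage sketch: `load_batch` assumes `next(data_iter)` yields
    # a (data, label) pair, e.g. an iterator over a PyTorch DataLoader:
    #
    #     data_iter = iter(torch.utils.data.DataLoader(dataset, batch_size=32))
    #     data, label = schedule.load_batch(data_iter)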

    def initialize(self, model, optimizer):
        """Initializes the model and the optimizer before training.
        This is often used in FP16 training.

        :param model: The neural network model
        :param optimizer: Optimizer for updating the parameters
        """
        return model, optimizer
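
    # Note: a concrete FP16 schedule would typically override `initialize` to
    # wrap the model and optimizer here (e.g. casting parameters to half
    # precision and attaching a loss scaler); this base implementation is a
    # no-op.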

    @abstractmethod
    def forward_backward_step(self,
                              data_iter,
                              model,
                              criterion,
                              optimizer=None,
                              forward_only=False,
                              grad_accum_size: int = 1,
                              return_loss=True):
        """The process function over a batch of the dataset, for training or evaluation.

        :param data_iter: Data iterator of the dataset
        :param model: Model used in training or evaluation
        :param criterion: Loss function
        :param optimizer: Optimizer used in training or evaluation
        :param forward_only: If True, the process won't include the backward pass
        :param grad_accum_size: Number of gradient accumulation steps
        :param return_loss: If False, the loss won't be returned
        """
        pass

    @abstractmethod
    def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
        """Updates the parameters with the optimizer.

        :param model: The neural network model
        :param optimizer: Optimizer for updating the parameters
        :param grad_clipping: Max norm for gradient clipping; 0.0 disables clipping
        :type grad_clipping: float, optional
        """
        pass
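

# A minimal sketch of a concrete schedule, assuming a plain single-device
# training loop. The class name `SimpleSchedule` and the logic below are
# illustrative assumptions, not part of the library; a real schedule (e.g. a
# pipeline schedule) would also handle FP16 and distributed communication.
class SimpleSchedule(BaseSchedule):
    """Runs one forward/backward pass per call, with no pipelining."""

    def forward_backward_step(self,
                              data_iter,
                              model,
                              criterion,
                              optimizer=None,
                              forward_only=False,
                              grad_accum_size: int = 1,
                              return_loss=True):
        data, label = self.load_batch(data_iter)
        output = model(data)
        # The loss is needed both for backward and when the caller asks for it.
        loss = None
        if return_loss or not forward_only:
            loss = criterion(output, label)
        if not forward_only:
            # Scale the loss so gradients average over the accumulation window.
            (loss / grad_accum_size).backward()
        return output, label, (loss if return_loss else None)

    def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
        if grad_clipping > 0.0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clipping)
        optimizer.step()
        optimizer.zero_grad()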