ColossalAI/colossalai/engine/schedule/_pipeline_schedule_v2.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from typing import Iterable, Tuple

import torch.cuda

import colossalai.communication.p2p_v2 as comm
from colossalai import engine
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils.cuda import get_current_device

from ._pipeline_schedule import PipelineSchedule


def pack_return_tensors(return_tensors):
    output, label = tuple(zip(*return_tensors))
    if isinstance(output[0], torch.Tensor):
        output = torch.cat(output, dim=0)
    elif isinstance(output[0], (list, tuple)):
        output = tuple(torch.cat(tensors, dim=0) for tensors in zip(*output))
    else:
        raise TypeError(f'Output of model must be tensor or list/tuple of tensors')
    if isinstance(label[0], torch.Tensor):
        label = torch.cat(label, dim=0)
    else:
        merged_label = {k: [] for k in label[0].keys()}
        for d in label:
            for k, v in d.items():
                merged_label[k].append(v)
        label = {k: torch.cat(v, dim=0) for k, v in merged_label.items()}
    return output, label


class PipelineScheduleV2(PipelineSchedule):
    """Derived class of PipelineSchedule, the only difference is that
       forward_backward_step is reconstructed with p2p_v2

    Args:
        num_microbatches (int): The number of microbatches.
        data_process_func (Callable, optional):
            The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
        tensor_shape (torch.Size, optional): Specified shape in pipeline communication.
        scatter_gather_tensors (bool, optional):
            If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.

    Example:

        # this shows an example of customized data_process_func
        def data_process_func(stage_output, dataloader_output):
            output1, output2 = stage_output
            item1, item2, item3 = dataloader_output

            # assume item2 is not needed
            data = (output1, output2, item1)
            label = item3
            return data, label

    """

    def forward_backward_step(self,
                              engine: engine.Engine,
                              data_iter: Iterable,
                              forward_only=False,
                              return_loss=True,
                              return_output_label=True) -> Tuple[torch.Tensor]:
        """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
        Returns a tuple with losses if the last stage, an empty tuple otherwise.

        Args:
            engine (colossalai.engine.Engine): Colossalai engine for training and inference.
            data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
            forward_only (bool, optional):
                Whether run forward step only. Default is false. If true, no backward will be run.
            return_loss (bool, optional): Whether returns the loss value. Default is true.
            return_output_label (bool, optional): If False, the output and label won't be returned.

        Returns:
            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
        """

        assert forward_only or return_loss, \
            'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
        self.load_batch(data_iter)

        # num_warmup_microbatches is the step when not all the processers are working
        num_warmup_microbatches = \
            (gpc.get_world_size(ParallelMode.PIPELINE)
             - gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
        num_warmup_microbatches = min(num_warmup_microbatches, self.num_microbatches)
        num_microbatches_remaining = self.num_microbatches - num_warmup_microbatches

        # Input, output tensors only need to be saved when doing backward passes
        input_objs = None
        output_objs = None
        # local_rank = gpc.get_local_rank(ParallelMode.PIPELINE)

        if not forward_only:
            input_objs = []
            output_objs = []
        return_tensors = []
        if return_loss and gpc.is_pipeline_last_stage(ignore_virtual=True):
            accum_loss = torch.zeros(1, device=get_current_device())
        else:
            accum_loss = None

        # Run warmup forward passes.
        for i in range(num_warmup_microbatches):
            input_obj = comm.recv_forward()

            output_obj = self._forward_step(engine,
                                            input_obj,
                                            return_tensors,
                                            return_output_label=return_output_label,
                                            accum_loss=accum_loss)

            comm.send_forward(output_obj)

            if not forward_only:
                input_objs.append(input_obj)
                output_objs.append(output_obj)

        # Before running 1F1B, need to receive first forward tensor.
        # If all microbatches are run in warmup / cooldown phase, then no need to
        # receive this tensor here.
        if num_microbatches_remaining > 0:
            input_obj = comm.recv_forward()

        # Run 1F1B in steady state.
        for i in range(num_microbatches_remaining):
            last_iteration = (i == (num_microbatches_remaining - 1))

            output_obj = self._forward_step(engine,
                                            input_obj,
                                            return_tensors,
                                            return_output_label=return_output_label,
                                            accum_loss=accum_loss)
            if forward_only:
                comm.send_forward(output_obj)

                if not last_iteration:
                    input_obj = comm.recv_forward()

            else:
                # TODO adjust here
                comm.send_forward(output_obj)
                output_obj_grad = comm.recv_backward()

                # Add input_obj and output_obj to end of list.
                input_objs.append(input_obj)
                output_objs.append(output_obj)

                # Pop output_obj and output_obj from the start of the list for
                # the backward pass.
                input_obj = input_objs.pop(0)
                output_obj = output_objs.pop(0)

                input_obj_grad = self._backward_step(engine, input_obj, output_obj, output_obj_grad)

                if last_iteration:
                    input_obj = None
                    comm.send_backward(input_obj_grad)
                else:
                    input_obj = comm.recv_forward()
                    comm.send_backward(input_obj_grad)

        # Run cooldown backward passes.
        if not forward_only:
            for i in range(num_warmup_microbatches):
                input_obj = input_objs.pop(0)
                output_obj = output_objs.pop(0)

                output_obj_grad = comm.recv_backward()
                input_obj_grad = self._backward_step(engine, input_obj, output_obj, output_obj_grad)
                comm.send_backward(input_obj_grad)

        if len(return_tensors) > 0:
            output, label = pack_return_tensors(return_tensors)
            return output, label, accum_loss
        else:
            return None, None, accum_loss
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00			`#!/usr/bin/env python`
			`# -- encoding: utf-8 --`

[NFC] polish colossalai/engine/schedule/_pipeline_schedule_v2.py code style (#3275) 2023-03-28 06:31:38 +00:00			`from typing import Iterable, Tuple`
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00
			`import torch.cuda`
[NFC] polish colossalai/engine/schedule/_pipeline_schedule_v2.py code style (#3275) 2023-03-28 06:31:38 +00:00
			`import colossalai.communication.p2p_v2 as comm`
			`from colossalai import engine`
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00			`from colossalai.context.parallel_mode import ParallelMode`
			`from colossalai.core import global_context as gpc`
			`from colossalai.utils.cuda import get_current_device`

			`from ._pipeline_schedule import PipelineSchedule`


			`def pack_return_tensors(return_tensors):`
			`output, label = tuple(zip(*return_tensors))`
			`if isinstance(output[0], torch.Tensor):`
			`output = torch.cat(output, dim=0)`
			`elif isinstance(output[0], (list, tuple)):`
			`output = tuple(torch.cat(tensors, dim=0) for tensors in zip(*output))`
			`else:`
			`raise TypeError(f'Output of model must be tensor or list/tuple of tensors')`
			`if isinstance(label[0], torch.Tensor):`
			`label = torch.cat(label, dim=0)`
			`else:`
			`merged_label = {k: [] for k in label[0].keys()}`
			`for d in label:`
			`for k, v in d.items():`
			`merged_label[k].append(v)`
			`label = {k: torch.cat(v, dim=0) for k, v in merged_label.items()}`
			`return output, label`


			`class PipelineScheduleV2(PipelineSchedule):`
			`"""Derived class of PipelineSchedule, the only difference is that`
			`forward_backward_step is reconstructed with p2p_v2`
[NFC] polish colossalai/engine/schedule/_pipeline_schedule_v2.py code style (#3275) 2023-03-28 06:31:38 +00:00
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00			`Args:`
			`num_microbatches (int): The number of microbatches.`
			`data_process_func (Callable, optional):`
			The preprocessing function which receives a batch of data, and it will be executed in `load_batch`.
			`tensor_shape (torch.Size, optional): Specified shape in pipeline communication.`
			`scatter_gather_tensors (bool, optional):`
			If set to `True`, communication will be reduced over pipeline when using 1D tensor parallelization.
[NFC] polish colossalai/engine/schedule/_pipeline_schedule_v2.py code style (#3275) 2023-03-28 06:31:38 +00:00
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00			`Example:`
[NFC] polish colossalai/engine/schedule/_pipeline_schedule_v2.py code style (#3275) 2023-03-28 06:31:38 +00:00
[engin/schedule] use p2p_v2 to recontruct pipeline_schedule (#1408) * support p2p communication with any type of object \| pass test * reconstruct pipeline schedule with p2p_v2.py(support communication with List[Any]) \| pass test * [communication] add p2p_v2.py to support communication with List[Any] * Delete _pipeline_schedule_v2.py * Delete test_cifar_with_data_pipeline_tensor_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * Delete p2p_v2.py * Delete test_boardcast_send_recv_v2.py * Delete test_object_list_p2p_v2.py * [engin/schedule] use p2p_v2 to recontruct pipeline_schedule * [communication] remove print code * [communication] remove print code * [engin/schedule] shorten the running time of testing file to prevent cancelling in CI 2022-08-12 03:33:26 +00:00			`# this shows an example of customized data_process_func`
			`def data_process_func(stage_output, dataloader_output):`
			`output1, output2 = stage_output`
			`item1, item2, item3 = dataloader_output`

			`# assume item2 is not needed`
			`data = (output1, output2, item1)`
			`label = item3`
			`return data, label`

			`"""`

			`def forward_backward_step(self,`
			`engine: engine.Engine,`
			`data_iter: Iterable,`
			`forward_only=False,`
			`return_loss=True,`
			`return_output_label=True) -> Tuple[torch.Tensor]:`
			`"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.`
			`Returns a tuple with losses if the last stage, an empty tuple otherwise.`

			`Args:`
			`engine (colossalai.engine.Engine): Colossalai engine for training and inference.`
			`data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).`
			`forward_only (bool, optional):`
			`Whether run forward step only. Default is false. If true, no backward will be run.`
			`return_loss (bool, optional): Whether returns the loss value. Default is true.`
			`return_output_label (bool, optional): If False, the output and label won't be returned.`

			`Returns:`
			Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
			`"""`

			`assert forward_only or return_loss, \`
			`'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'`
			`self.load_batch(data_iter)`

			`# num_warmup_microbatches is the step when not all the processers are working`
			`num_warmup_microbatches = \`
			`(gpc.get_world_size(ParallelMode.PIPELINE)`
			`- gpc.get_local_rank(ParallelMode.PIPELINE) - 1)`
			`num_warmup_microbatches = min(num_warmup_microbatches, self.num_microbatches)`
			`num_microbatches_remaining = self.num_microbatches - num_warmup_microbatches`

			`# Input, output tensors only need to be saved when doing backward passes`
			`input_objs = None`
			`output_objs = None`
			`# local_rank = gpc.get_local_rank(ParallelMode.PIPELINE)`

			`if not forward_only:`
			`input_objs = []`
			`output_objs = []`
			`return_tensors = []`
			`if return_loss and gpc.is_pipeline_last_stage(ignore_virtual=True):`
			`accum_loss = torch.zeros(1, device=get_current_device())`
			`else:`
			`accum_loss = None`

			`# Run warmup forward passes.`
			`for i in range(num_warmup_microbatches):`
			`input_obj = comm.recv_forward()`

			`output_obj = self._forward_step(engine,`
			`input_obj,`
			`return_tensors,`
			`return_output_label=return_output_label,`
			`accum_loss=accum_loss)`

			`comm.send_forward(output_obj)`

			`if not forward_only:`
			`input_objs.append(input_obj)`
			`output_objs.append(output_obj)`

			`# Before running 1F1B, need to receive first forward tensor.`
			`# If all microbatches are run in warmup / cooldown phase, then no need to`
			`# receive this tensor here.`
			`if num_microbatches_remaining > 0:`
			`input_obj = comm.recv_forward()`

			`# Run 1F1B in steady state.`
			`for i in range(num_microbatches_remaining):`
			`last_iteration = (i == (num_microbatches_remaining - 1))`

			`output_obj = self._forward_step(engine,`
			`input_obj,`
			`return_tensors,`
			`return_output_label=return_output_label,`
			`accum_loss=accum_loss)`
			`if forward_only:`
			`comm.send_forward(output_obj)`

			`if not last_iteration:`
			`input_obj = comm.recv_forward()`

			`else:`
			`# TODO adjust here`
			`comm.send_forward(output_obj)`
			`output_obj_grad = comm.recv_backward()`

			`# Add input_obj and output_obj to end of list.`
			`input_objs.append(input_obj)`
			`output_objs.append(output_obj)`

			`# Pop output_obj and output_obj from the start of the list for`
			`# the backward pass.`
			`input_obj = input_objs.pop(0)`
			`output_obj = output_objs.pop(0)`

			`input_obj_grad = self._backward_step(engine, input_obj, output_obj, output_obj_grad)`

			`if last_iteration:`
			`input_obj = None`
			`comm.send_backward(input_obj_grad)`
			`else:`
			`input_obj = comm.recv_forward()`
			`comm.send_backward(input_obj_grad)`

			`# Run cooldown backward passes.`
			`if not forward_only:`
			`for i in range(num_warmup_microbatches):`
			`input_obj = input_objs.pop(0)`
			`output_obj = output_objs.pop(0)`

			`output_obj_grad = comm.recv_backward()`
			`input_obj_grad = self._backward_step(engine, input_obj, output_obj, output_obj_grad)`
			`comm.send_backward(input_obj_grad)`

			`if len(return_tensors) > 0:`
			`output, label = pack_return_tensors(return_tensors)`
			`return output, label, accum_loss`
			`else:`
			`return None, None, accum_loss`