ColossalAI/colossalai/nn/layer/parallel_1d/_utils.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import torch
import torch.distributed as dist
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env

from ..utils import divide


def set_parallel_input(input_parallel: bool) -> None:
    """Store ``input_parallel`` in the shared tensor-parallel environment.

    The value is written to ``env.parallel_input_1d`` and can be read back
    with :func:`get_parallel_input`.

    :param input_parallel: flag to record
        (NOTE(review): presumably marks whether the input to the next 1D
        layer is already split across ranks -- confirm against callers)
    """
    env.parallel_input_1d = input_parallel


def get_parallel_input():
    """Return the current value of ``env.parallel_input_1d``.

    :return: whatever was last stored by :func:`set_parallel_input`
    """
    return env.parallel_input_1d


def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank):
    """Compute the half-open vocabulary index range owned by ``rank``.

    :param per_partition_vocab_size: number of vocabulary entries per partition
    :param rank: index of the partition
    :return: tuple ``(first, last)`` where ``first`` is inclusive and
        ``last`` is exclusive
    """
    first = rank * per_partition_vocab_size
    return first, first + per_partition_vocab_size


def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
    """Compute the vocabulary index range owned by ``rank`` from the global size.

    The per-partition size is obtained with ``divide(global_vocab_size,
    world_size)`` and then turned into a range by
    :func:`vocab_range_from_per_partition_vocab_size`.

    :param global_vocab_size: total number of vocabulary entries
    :param rank: index of the partition
    :param world_size: number of partitions
    :return: tuple ``(first, last)`` with ``last`` exclusive
    """
    # NOTE(review): ``divide`` comes from ``..utils``; presumably it enforces
    # exact divisibility -- confirm there.
    shard_size = divide(global_vocab_size, world_size)
    index_range = vocab_range_from_per_partition_vocab_size(shard_size, rank)
    return index_range


def _reduce(input_, parallel_mode):
    """All-reduce ``input_`` across the ranks of ``parallel_mode``.

    :param input_: tensor handed to ``dist.all_reduce``; it is reduced in place
    :param parallel_mode: parallel mode used to look up the world size and
        the process group
    :return: the same ``input_`` object (left untouched when the group has a
        single rank)
    """
    has_peers = gpc.get_world_size(parallel_mode) != 1
    if has_peers:
        # all_reduce writes its result back into input_ (default reduce op).
        dist.all_reduce(input_, group=gpc.get_group(parallel_mode))
    return input_


def _split(input_, parallel_mode, dim=-1):
    """Return this rank's equally sized slice of ``input_`` along ``dim``.

    :param input_: tensor to split
    :param parallel_mode: parallel mode used to look up the world size and
        the local rank
    :param dim: dimension to split along (defaults to the last one)
    :return: ``input_`` itself when only one rank is involved, otherwise a
        contiguous copy/view of the chunk indexed by the local rank
    :raises AssertionError: if the size of ``dim`` is not a multiple of the
        world size
    """
    world_size = gpc.get_world_size(parallel_mode)
    if world_size == 1:
        # A single rank owns the whole tensor; nothing to split.
        return input_

    # The split happens along ``dim`` and must be even across all ranks.
    dim_size = input_.size(dim)
    assert dim_size % world_size == 0, (
        f'The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), '
        f'cannot split tensor evenly'
    )

    chunks = torch.split(input_, dim_size // world_size, dim=dim)
    local_chunk = chunks[gpc.get_local_rank(parallel_mode)]
    return local_chunk.contiguous()


def _gather(input_, parallel_mode, dim=-1):
    """All-gather ``input_`` from every rank and concatenate along ``dim``.

    :param input_: local tensor contributed by this rank
    :param parallel_mode: parallel mode used to look up the world size, the
        local rank and the process group
    :param dim: dimension along which the gathered tensors are concatenated
        (defaults to the last one)
    :return: ``input_`` itself when only one rank is involved, otherwise a
        contiguous tensor holding the chunks of all ranks in rank order
    """
    # skip if only one rank involved
    world_size = gpc.get_world_size(parallel_mode)
    if world_size == 1:
        return input_

    # Gradients reaching this helper may be non-contiguous views.
    # NOTE(review): the NCCL backend rejects such inputs ("Tensors must be
    # contiguous"); ``contiguous()`` is a no-op for already-contiguous tensors.
    input_ = input_.contiguous()

    # Reuse the local tensor as this rank's slot instead of allocating a
    # buffer that would be replaced right away.
    rank = gpc.get_local_rank(parallel_mode)
    tensor_list = [input_ if idx == rank else torch.empty_like(input_) for idx in range(world_size)]
    dist.all_gather(tensor_list, input_, group=gpc.get_group(parallel_mode))

    # concat in rank order
    return torch.cat(tensor_list, dim=dim).contiguous()


class _ReduceGrad(torch.autograd.Function):
    """
    Pass the input to the model parallel region.

    The forward pass returns the input unchanged; the backward pass
    all-reduces the incoming gradient over ``parallel_mode``.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    """

    @staticmethod
    def symbolic(graph, input_, parallel_mode=None):
        # The exporter calls symbolic() with the same arguments as forward(),
        # so ``parallel_mode`` must be accepted even though the forward pass
        # is an identity. It is optional to keep two-argument calls working.
        return input_

    @staticmethod
    def forward(ctx, input_, parallel_mode):
        # Remember the mode so backward() knows which group to reduce over.
        ctx.mode = parallel_mode
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward argument: ``parallel_mode`` gets None.
        return _reduce(grad_output, ctx.mode), None


class _ReduceInput(torch.autograd.Function):
    """
    All-reduce the input from the model parallel region.

    The forward pass all-reduces the input over ``parallel_mode``; the
    backward pass returns the gradient unchanged.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    """

    @staticmethod
    def symbolic(graph, input_, parallel_mode):
        # ``_reduce`` requires the parallel mode; the previous one-argument
        # call raised TypeError whenever symbolic() was invoked.
        return _reduce(input_, parallel_mode)

    @staticmethod
    def forward(ctx, input_, parallel_mode):
        return _reduce(input_, parallel_mode)

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward argument: ``parallel_mode`` gets None.
        return grad_output, None


class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the chunk corresponding to the rank.

    The backward pass gathers the gradient chunks from all ranks and
    concatenates them along the same dimension.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :param dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_, parallel_mode, dim=-1):
        # ``_split`` requires the parallel mode; the previous one-argument
        # call raised TypeError whenever symbolic() was invoked. ``dim``
        # defaults to the same value ``_split`` itself uses.
        return _split(input_, parallel_mode, dim)

    @staticmethod
    def forward(ctx, input_, parallel_mode, dim):
        # Remember mode and dimension for the matching gather in backward().
        ctx.mode = parallel_mode
        ctx.dim = dim
        return _split(input_, parallel_mode, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward argument: mode and dim get None.
        return _gather(grad_output, ctx.mode, ctx.dim), None, None


class _GatherForwardSplitBackward(torch.autograd.Function):
    """
    Gather the input from model parallel region and concatenate.

    The backward pass splits the gradient along the same dimension and keeps
    only the chunk corresponding to the rank.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :param dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_, parallel_mode, dim=-1):
        # ``_gather`` requires the parallel mode; the previous one-argument
        # call raised TypeError whenever symbolic() was invoked. ``dim``
        # defaults to the same value ``_gather`` itself uses.
        return _gather(input_, parallel_mode, dim)

    @staticmethod
    def forward(ctx, input_, parallel_mode, dim):
        # Remember mode and dimension for the matching split in backward().
        ctx.mode = parallel_mode
        ctx.dim = dim
        return _gather(input_, parallel_mode, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward argument: mode and dim get None.
        return _split(grad_output, ctx.mode, ctx.dim), None, None


def reduce_grad(input_, parallel_mode):
    """Apply :class:`_ReduceGrad` to ``input_``.

    The forward pass returns ``input_`` unchanged; the backward pass
    all-reduces the gradient over ``parallel_mode``.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :return: result of ``_ReduceGrad.apply``
    """
    return _ReduceGrad.apply(input_, parallel_mode)


def reduce_input(input_, parallel_mode):
    """Apply :class:`_ReduceInput` to ``input_``.

    The forward pass all-reduces ``input_`` over ``parallel_mode``; the
    backward pass returns the gradient unchanged.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :return: result of ``_ReduceInput.apply``
    """
    return _ReduceInput.apply(input_, parallel_mode)


def split_forward_gather_backward(input_, parallel_mode, dim):
    """Apply :class:`_SplitForwardGatherBackward` to ``input_``.

    The forward pass keeps this rank's chunk of ``input_`` along ``dim``; the
    backward pass gathers the gradient chunks along the same dimension.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :param dim: dimension to split along
    :return: result of ``_SplitForwardGatherBackward.apply``
    """
    return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim)


def gather_forward_split_backward(input_, parallel_mode, dim):
    """Apply :class:`_GatherForwardSplitBackward` to ``input_``.

    The forward pass gathers ``input_`` from all ranks and concatenates along
    ``dim``; the backward pass keeps this rank's chunk of the gradient along
    the same dimension.

    :param input_: input matrix
    :param parallel_mode: parallel mode
    :param dim: dimension to concatenate along
    :return: result of ``_GatherForwardSplitBackward.apply``
    """
    return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)
Migrated project 2021-10-28 16:21:23 +00:00			`#!/usr/bin/env python`
			`# -- encoding: utf-8 --`

Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`import torch`
			`import torch.distributed as dist`
			`from colossalai.core import global_context as gpc`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`from colossalai.global_variables import tensor_parallel_env as env`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-29 15:32:10 +00:00			`from ..utils import divide`


			`def set_parallel_input(input_parallel: bool):`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`env.parallel_input_1d = input_parallel`
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-29 15:32:10 +00:00

			`def get_parallel_input():`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`return env.parallel_input_1d`
Migrated project 2021-10-28 16:21:23 +00:00

			`def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank):`
			`index_f = rank * per_partition_vocab_size`
			`index_l = index_f + per_partition_vocab_size`
			`return index_f, index_l`


			`def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):`
			`per_partition_vocab_size = divide(global_vocab_size, world_size)`
			`return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank)`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-09 07:08:29 +00:00

Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`def _reduce(input_, parallel_mode):`
			`# skip if only one rank involved`
			`if gpc.get_world_size(parallel_mode) == 1:`
			`return input_`
			`dist.all_reduce(input_, group=gpc.get_group(parallel_mode))`

			`return input_`


			`def _split(input_, parallel_mode, dim=-1):`
			`# skip if only one rank involved`
			`world_size = gpc.get_world_size(parallel_mode)`
			`if world_size == 1:`
			`return input_`

			`# Split along last dimension.`
			`dim_size = input_.size(dim)`
			`assert dim_size % world_size == 0, \`
			`f'The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), ' \`
			`f'cannot split tensor evenly'`

			`tensor_list = torch.split(input_, dim_size // world_size, dim=dim)`
			`rank = gpc.get_local_rank(parallel_mode)`
			`output = tensor_list[rank].contiguous()`

			`return output`


			`def _gather(input_, parallel_mode, dim=-1):`
			`# skip if only one rank involved`
			`world_size = gpc.get_world_size(parallel_mode)`
			`if world_size == 1:`
			`return input_`

			`# all gather`
			`rank = gpc.get_local_rank(parallel_mode)`
			`tensor_list = [torch.empty_like(input_) for _ in range(world_size)]`
			`tensor_list[rank] = input_`
			`torch.distributed.all_gather(tensor_list, input_, group=gpc.get_group(parallel_mode))`

			`# concat`
			`output = torch.cat(tensor_list, dim=dim).contiguous()`

			`return output`


			`class _ReduceGrad(torch.autograd.Function):`
Update layer integration documentations (#108) Update the documentations of layer integration Update _log_hook.py Update _operation.py 2022-01-10 10:05:58 +00:00			`"""`
			`Pass the input to the model parallel region.`

			`:param input_: input matrix`
			`:param parallel_mode: parallel mode`
			`"""`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`@staticmethod`
			`def symbolic(graph, input_):`
			`return input_`

			`@staticmethod`
			`def forward(ctx, input_, parallel_mode):`
			`ctx.mode = parallel_mode`
			`return input_`

			`@staticmethod`
			`def backward(ctx, grad_output):`
			`return _reduce(grad_output, ctx.mode), None`


			`class _ReduceInput(torch.autograd.Function):`
Update layer integration documentations (#108) Update the documentations of layer integration Update _log_hook.py Update _operation.py 2022-01-10 10:05:58 +00:00			`"""`
			`All-reduce the input from the model parallel region.`

			`:param input_: input matrix`
			`:param parallel_mode: parallel mode`
			`"""`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`@staticmethod`
			`def symbolic(graph, input_):`
			`return _reduce(input_)`

			`@staticmethod`
			`def forward(ctx, input_, parallel_mode):`
			`return _reduce(input_, parallel_mode)`

			`@staticmethod`
			`def backward(ctx, grad_output):`
			`return grad_output, None`


			`class _SplitForwardGatherBackward(torch.autograd.Function):`
Update layer integration documentations (#108) Update the documentations of layer integration Update _log_hook.py Update _operation.py 2022-01-10 10:05:58 +00:00			`"""`
			`Split the input and keep only the corresponding chuck to the rank.`

			`:param input_: input matrix`
			`:param parallel_mode: parallel mode`
			`:param dim: dimension`
			`"""`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`@staticmethod`
			`def symbolic(graph, input_):`
			`return _split(input_)`

			`@staticmethod`
			`def forward(ctx, input_, parallel_mode, dim):`
			`ctx.mode = parallel_mode`
			`ctx.dim = dim`
			`return _split(input_, parallel_mode, dim)`

			`@staticmethod`
			`def backward(ctx, grad_output):`
			`return _gather(grad_output, ctx.mode, ctx.dim), None, None`


			`class _GatherForwardSplitBackward(torch.autograd.Function):`
Update layer integration documentations (#108) Update the documentations of layer integration Update _log_hook.py Update _operation.py 2022-01-10 10:05:58 +00:00			`"""`
			`Gather the input from model parallel region and concatinate.`

			`:param input_: input matrix`
			`:param parallel_mode: parallel mode`
			`:param dim: dimension`
			`"""`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`@staticmethod`
			`def symbolic(graph, input_):`
			`return _gather(input_)`

			`@staticmethod`
			`def forward(ctx, input_, parallel_mode, dim):`
			`ctx.mode = parallel_mode`
			`ctx.dim = dim`
			`return _gather(input_, parallel_mode, dim)`

			`@staticmethod`
			`def backward(ctx, grad_output):`
			`return _split(grad_output, ctx.mode, ctx.dim), None, None`


			`def reduce_grad(input_, parallel_mode):`
			`return _ReduceGrad.apply(input_, parallel_mode)`


			`def reduce_input(input_, parallel_mode):`
			`return _ReduceInput.apply(input_, parallel_mode)`


			`def split_forward_gather_backward(input_, parallel_mode, dim):`
			`return _SplitForwardGatherBackward.apply(input_, parallel_mode, dim)`

Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-09 07:08:29 +00:00
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`def gather_forward_split_backward(input_, parallel_mode, dim):`
			`return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)`