# ColossalAI/colossalai/nn/layer/parallel_3d/_utils.py

from collections import OrderedDict
from functools import partial

import torch
from torch import Tensor

from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env


def get_depth_from_env() -> int:
    """Return the 3D tensor-parallel depth stored on the tensor-parallel env object.

    Returns:
        int: the value of ``env.depth_3d``.

    Raises:
        EnvironmentError: if ``depth_3d`` cannot be looked up on ``env`` or is ``None``.
        AssertionError: if the stored depth is not greater than zero.
    """
    not_found_msg = ('DEPTH is not found in the current environment, '
                     'please make sure that you have used the correct process group initializer')
    try:
        depth = env.depth_3d
    except (KeyError, AttributeError) as e:
        # Attribute access signals a missing value with AttributeError; KeyError is still
        # handled in case ``env`` resolves attributes through a mapping.
        raise EnvironmentError(not_found_msg) from e

    if depth is None:
        # An unset depth would otherwise fail with an opaque TypeError in the comparison below.
        raise EnvironmentError(not_found_msg)

    assert depth > 0, 'DEPTH must be greater than zero'
    return depth


def get_parallel_mode_from_env(group):
    """Look up the attribute of ``env`` named by ``group``.

    Args:
        group: one of the 3D group constants (``INPUT_GROUP_3D``, ``WEIGHT_GROUP_3D``,
            ``OUTPUT_GROUP_3D``, ``INPUT_X_WEIGHT_3D``, ``OUTPUT_X_WEIGHT_3D``); it is used
            as the attribute name on ``env``.

    Returns:
        Whatever ``env`` currently stores under that attribute name.

    Raises:
        AssertionError: if ``group`` is not one of the constants listed above.
    """
    valid_groups = [
        INPUT_GROUP_3D,
        WEIGHT_GROUP_3D,
        OUTPUT_GROUP_3D,
        INPUT_X_WEIGHT_3D,
        OUTPUT_X_WEIGHT_3D,
    ]
    is_valid = group in valid_groups
    assert is_valid, f'{group} is not valid for 3D tensor parallelism.'
    parallel_mode = getattr(env, group)
    return parallel_mode


def swap_in_out_group():
    """Exchange the input and output entries stored on ``env``.

    Swaps ``env.input_group_3d`` with ``env.output_group_3d`` and then
    ``env.input_x_weight_group_3d`` with ``env.output_x_weight_group_3d``.
    """
    # Plain groups: read both values before writing either one back.
    former_output = env.output_group_3d
    former_input = env.input_group_3d
    env.input_group_3d = former_output
    env.output_group_3d = former_input

    # "x weight" groups: same exchange.
    former_output_x_weight = env.output_x_weight_group_3d
    former_input_x_weight = env.input_x_weight_group_3d
    env.input_x_weight_group_3d = former_output_x_weight
    env.output_x_weight_group_3d = former_input_x_weight


def dbg_check_shape(tensor: Tensor, shape: tuple):
    """Debug helper: print the tensor's shape on global rank 0 and assert it equals ``shape``.

    Args:
        tensor: tensor whose ``shape`` is inspected.
        shape: expected shape.

    Raises:
        AssertionError: if ``tensor.shape`` differs from ``shape``.
    """
    # Only global rank 0 prints.
    if gpc.get_global_rank() == 0:
        print(tensor.shape)

    observed = tensor.shape
    assert observed == shape, '{} does not match {}'.format(observed, shape)


class AsyncGradientBucket:
    """Holds ``(async_op, grad_tensor)`` pairs keyed by a parameter id until they are collected.

    ``async_op`` is either ``None`` or an object exposing ``wait()``; it is waited on
    before the stored gradient is handed out.
    """

    def __init__(self):
        # Maps param_id -> (async_op, grad_tensor), kept in insertion order.
        self.bucket = OrderedDict()

    def __len__(self):
        """Return the number of entries that have not been collected yet."""
        return len(self.bucket)

    def push(self, async_op, grad_tensor, param_id):
        """Store a pending gradient and return a zero tensor shaped like it.

        Args:
            async_op: handle with a ``wait()`` method, or ``None``.
            grad_tensor: the gradient tensor to keep until collection.
            param_id: key under which the pair is stored; an existing entry is overwritten.

        Returns:
            A new all-zero tensor with the same shape, dtype and device as ``grad_tensor``.
        """
        self.bucket[param_id] = (async_op, grad_tensor)
        # zeros_like already inherits dtype and device from its input.
        return torch.zeros_like(grad_tensor)

    def pop(self, param_id):
        """Remove and return the gradient stored for ``param_id``, waiting on its op first.

        Returns ``None`` when nothing is stored under ``param_id``.
        """
        entry = self.bucket.pop(param_id, None)
        if entry is None:
            return None
        handle, pending_grad = entry
        if handle is not None:
            handle.wait()
        return pending_grad

    def synchronize(self, params):
        """Collect pending gradients for ``params`` and add them in place to each ``p.grad``.

        Entries are looked up by ``id(p)``; parameters without a stored entry are skipped.
        """
        for param in params:
            entry = self.bucket.pop(id(param), None)
            if entry is None:
                continue
            handle, pending_grad = entry
            if handle is not None:
                handle.wait()
            param.grad.add_(pending_grad)


# Module-level bucket instance used by push_async_grad, pop_async_grad and synchronize.
_async_grad_bucket = AsyncGradientBucket()


def push_async_grad(op, grad, param_id):
    """Store ``(op, grad)`` in the module-level bucket under ``param_id``.

    Args:
        op: handle with a ``wait()`` method, or ``None``.
        grad: gradient tensor to keep until it is collected.
        param_id: key for the entry.

    Returns:
        The value returned by ``AsyncGradientBucket.push`` (a zero tensor shaped like ``grad``).
    """
    placeholder = _async_grad_bucket.push(async_op=op, grad_tensor=grad, param_id=param_id)
    return placeholder


def pop_async_grad(param_id):
    """Remove and return the gradient stored for ``param_id`` in the module-level bucket.

    Returns ``None`` when the bucket holds no entry for ``param_id``.
    """
    collected = _async_grad_bucket.pop(param_id=param_id)
    return collected


def _async_grad_hook(grad, param_id):
    """Gradient hook that adds the pending gradient for ``param_id`` into ``grad``.

    Args:
        grad: gradient tensor handed to the hook.
        param_id: key under which the pending gradient was pushed.

    Returns:
        ``grad``, with the pending gradient added in place when one was stored.
    """
    pending = pop_async_grad(param_id)
    # pop_async_grad returns None when nothing was pushed for this id (or it was already
    # collected); ``grad.add_(None)`` would raise, so the gradient is passed through as is.
    if pending is not None:
        # NOTE(review): the addition is kept in place, as in the original implementation;
        # confirm this is intended given PyTorch's guidance on modifying hook arguments.
        grad.add_(pending)
    return grad


def register_async_grad_hook(param):
    """Register ``_async_grad_hook`` on ``param``, bound to ``id(param)`` as its key.

    The handle returned by ``param.register_hook`` is discarded.
    """
    key = id(param)
    hook = partial(_async_grad_hook, param_id=key)
    param.register_hook(hook)


def synchronize(params=None):
    """Collect pending asynchronous gradients and verify that none are left over.

    Args:
        params: iterable of parameters whose pending gradients (keyed by ``id(p)``) are
            added into ``p.grad``. ``None`` (the default) collects nothing.

    Raises:
        RuntimeError: if the module-level bucket still holds entries afterwards.
    """
    # A None sentinel replaces the former mutable ``list()`` default.
    if params is not None:
        _async_grad_bucket.synchronize(params)
    torch.cuda.default_stream().synchronize()
    leftover = len(_async_grad_bucket)
    if leftover > 0:
        raise RuntimeError(f"{leftover} asynchronous gradient(s) not collected.")
updated tp layers 2022-10-26 12:54:39 +00:00			`from collections import OrderedDict`
			`from functools import partial`

			`import torch`
			`from torch import Tensor`

			`from colossalai.constants import INPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_GROUP_3D, OUTPUT_X_WEIGHT_3D, WEIGHT_GROUP_3D`
Migrated project 2021-10-28 16:21:23 +00:00			`from colossalai.core import global_context as gpc`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`from colossalai.global_variables import tensor_parallel_env as env`
Migrated project 2021-10-28 16:21:23 +00:00

			`def get_depth_from_env() -> int:`
			`try:`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`depth = env.depth_3d`
Migrated project 2021-10-28 16:21:23 +00:00			`assert depth > 0, 'DEPTH must be greater than zero'`
			`return depth`

			`except KeyError as e:`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`raise EnvironmentError('DEPTH is not found in the current environment, '`
			`'please make sure that you have used the correct process group initializer')`
Migrated project 2021-10-28 16:21:23 +00:00

Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-09 07:08:29 +00:00			`def get_parallel_mode_from_env(group):`
updated tp layers 2022-10-26 12:54:39 +00:00			`assert group in [INPUT_GROUP_3D, WEIGHT_GROUP_3D, OUTPUT_GROUP_3D, INPUT_X_WEIGHT_3D, OUTPUT_X_WEIGHT_3D], \`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`f'{group} is not valid for 3D tensor parallelism.'`
			`return getattr(env, group)`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-09 07:08:29 +00:00

			`def swap_in_out_group():`
moved env variables to global variables; (#215) added branch context; added vocab parallel layers; moved split_batch from load_batch to tensor parallel embedding layers; updated gpt model; updated unit test cases; fixed few collective communicator bugs 2022-02-14 03:15:02 +00:00			`env.input_group_3d, env.output_group_3d = env.output_group_3d, env.input_group_3d`
updated tp layers 2022-10-26 12:54:39 +00:00			`env.input_x_weight_group_3d, env.output_x_weight_group_3d = (`
			`env.output_x_weight_group_3d,`
			`env.input_x_weight_group_3d,`
			`)`
Develop/experiments (#59) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. * improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> * Split conv2d, class token, positional embedding in 2d, Fix random number in ddp Fix convergence in cifar10, Imagenet1000 * Integrate 1d tensor parallel in Colossal-AI (#39) * fixed 1D and 2D convergence (#38) * optimized 2D operations * fixed 1D ViT convergence problem * Feature/ddp (#49) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * support torch ddp * fix loss accumulation * add log for ddp * change seed * modify timing hook Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * Feature/pipeline (#40) * remove redundancy func in setup (#19) (#20) * use env to control the language of doc (#24) (#25) * Support TP-compatible Torch AMP and Update trainer API (#27) * Add gradient accumulation, fix lr scheduler * fix FP16 optimizer and adapted torch amp with tensor parallel (#18) * fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes * fixed trainer * Revert "fixed trainer" This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097. 
* improved consistency between trainer, engine and schedule (#23) Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> * add an example of ViT-B/16 and remove w_norm clipping in LAMB (#29) * add explanation for ViT example (#35) (#36) * optimize communication of pipeline parallel * fix grad clip for pipeline Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> * optimized 3d layer to fix slow computation ; tested imagenet performance with 3d; reworked lr_scheduler config definition; fixed launch args; fixed some printing issues; simplified apis of 3d layers (#51) * Update 2.5d layer code to get a similar accuracy on imagenet-1k dataset * update api for better usability (#58) update api for better usability Co-authored-by: 1SAA <c2h214748@gmail.com> Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: puck_WCR <46049915+WANG-CR@users.noreply.github.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com> Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-09 07:08:29 +00:00

Migrated project 2021-10-28 16:21:23 +00:00			`def dbg_check_shape(tensor: Tensor, shape: tuple):`
			`rank = gpc.get_global_rank()`
			`if rank == 0:`
			`print(tensor.shape)`
			`assert tensor.shape == shape, \`
			`'{} does not match {}'.format(tensor.shape, shape)`
updated tp layers 2022-10-26 12:54:39 +00:00

			`class AsyncGradientBucket(object):`

			`def __init__(self):`
			`self.bucket = OrderedDict()`

			`def __len__(self):`
			`return len(self.bucket)`

			`def push(self, async_op, grad_tensor, param_id):`
			`self.bucket[param_id] = tuple((async_op, grad_tensor))`
			`return torch.zeros_like(grad_tensor, dtype=grad_tensor.dtype, device=grad_tensor.device)`

			`def pop(self, param_id):`
			`grad = None`
			`if param_id in self.bucket:`
			`op, grad = self.bucket.pop(param_id)`
			`if op is not None:`
			`op.wait()`
			`return grad`

			`def synchronize(self, params):`
			`for p in params:`
			`i = id(p)`
			`if i in self.bucket:`
			`op, grad = self.bucket.pop(i)`
			`if op is not None:`
			`op.wait()`
			`p.grad.add_(grad)`


			`_async_grad_bucket = AsyncGradientBucket()`


			`def push_async_grad(op, grad, param_id):`
			`return _async_grad_bucket.push(op, grad, param_id)`


			`def pop_async_grad(param_id):`
			`return _async_grad_bucket.pop(param_id)`


			`def _async_grad_hook(grad, param_id):`
			`grad.add_(pop_async_grad(param_id))`
			`return grad`


			`def register_async_grad_hook(param):`
			`param.register_hook(partial(_async_grad_hook, param_id=id(param)))`


			`def synchronize(params=list()):`
			`_async_grad_bucket.synchronize(params)`
			`torch.cuda.default_stream().synchronize()`
			`if len(_async_grad_bucket) > 0:`
			`raise RuntimeError(f"{len(_async_grad_bucket)} asynchronous gradient(s) not collected.")`