ColossalAI/colossalai/engine/gradient_handler/_sequence_parallel_gradient...

from colossalai.core import global_context as gpc
from colossalai.registry import GRADIENT_HANDLER
from ._base_gradient_handler import BaseGradientHandler
from ...context.parallel_mode import ParallelMode
from .utils import bucket_allreduce

@GRADIENT_HANDLER.register_module
class SequenceParallelGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in the sequence-parallel
    data parallel group (``ParallelMode.SEQUENCE_DP``).

    An all-reduce collective communication is performed in
    :func:`handle_gradient` among that group. For better performance, the
    gradients of all parameters of the same type are bucketized to improve
    the efficiency of communication.

    Args:
        model (Module): Model where the gradients accumulate.
        optimizer (Optimizer): Optimizer for updating the parameters.
    """

    def handle_gradient(self):
        """Runs an all-reduce operation over the sequence-parallel data parallel group."""
        if gpc.get_world_size(ParallelMode.SEQUENCE_DP) > 1:
            bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.SEQUENCE_DP))
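
For context, bucket_allreduce (imported from .utils above) coalesces same-type gradients into one flat buffer so each bucket needs a single collective call instead of one all-reduce per parameter. The helper below is a minimal sketch of that bucketization pattern, not the actual ColossalAI implementation; the name bucket_allreduce_sketch and the averaging step are assumptions made for illustration.

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def bucket_allreduce_sketch(param_list, group=None):
    # Hypothetical sketch, NOT the real colossalai utils.bucket_allreduce.
    # Requires an initialized torch.distributed process group.

    # Group gradients by dtype so each bucket holds tensors of one type.
    buckets = {}
    for param in param_list:
        if param.requires_grad and param.grad is not None:
            buckets.setdefault(param.grad.dtype, []).append(param.grad.data)

    # For each bucket: flatten into one contiguous tensor, all-reduce once,
    # average across ranks (an assumption here), then copy the reduced
    # values back into the original gradient tensors.
    for dtype, grads in buckets.items():
        flat = _flatten_dense_tensors(grads)
        dist.all_reduce(flat, group=group)
        flat.div_(dist.get_world_size(group=group))
        for grad, reduced in zip(grads, _unflatten_dense_tensors(flat, grads)):
            grad.copy_(reduced)

A typical call site, assuming the usual backward-then-step training loop (the handler construction mirrors the Args above; the loop itself is illustrative):

    handler = SequenceParallelGradientHandler(model, optimizer)
    loss.backward()
    handler.handle_gradient()  # sync grads across SEQUENCE_DP before stepping
    optimizer.step()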