mirror of https://github.com/hpcaitech/ColossalAI
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
467 lines
20 KiB
467 lines
20 KiB
import math
|
|
from abc import ABC
|
|
from typing import Callable, Optional, Tuple
|
|
|
|
import torch
|
|
import torch.distributed as dist
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
from torch.distributed import ProcessGroup
|
|
|
|
from colossalai.accelerator import get_accelerator
|
|
from colossalai.moe._operation import moe_cumsum
|
|
from colossalai.moe.manager import MOE_MANAGER
|
|
|
|
|
|
class MoeRouter(nn.Module, ABC):
    """Base class for all MoE routers.

    Stores the routing hyper-parameters shared by every router and provides the
    helpers that size the expert buffers and compute the auxiliary
    (load-balancing) loss and the router z-loss.

    Args:
        k_value (int): The value of top_k.
        capacity_factor_train (float): Capacity factor in routing of training.
        capacity_factor_eval (float): Capacity factor in routing of evaluation.
        min_capacity (int): The minimum number of the capacity of each expert.
        noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
        drop_tks (bool, optional): Whether drops tokens in evaluation
        use_kernel (bool, optional): Stored as ``self.use_kernel``. Defaults to False.
    """

    def __init__(
        self,
        k_value: int,
        capacity_factor_train: float,
        capacity_factor_eval: float,
        min_capacity: int,
        noisy_func: Optional[Callable] = None,
        drop_tks: bool = True,
        use_kernel: bool = False,
    ):
        super().__init__()
        self.k_value = k_value
        self.capacity_factor_train = capacity_factor_train
        self.capacity_factor_eval = capacity_factor_eval
        self.min_capacity = min_capacity
        self.noisy_func = noisy_func
        self.drop_tks = drop_tks
        # Losses are staged here by set_aux_loss / set_z_loss and cleared by
        # pop_router_loss.
        self._aux_loss = None
        self._z_loss = None
        self.use_kernel = use_kernel

    def get_capacity(self, num_tokens, num_experts, ep_group=None):
        """Return the number of token slots reserved for each expert.

        Args:
            num_tokens (int): Number of tokens routed on this rank.
            num_experts (int): Number of experts.
            ep_group (ProcessGroup, optional): When given, the token count is
                summed over the group and divided by its world size before
                the capacity is derived.

        Returns:
            int: ``floor(k_value * capacity_factor * num_tokens / num_experts)``
            rounded up to an even number and clamped to at least
            ``min_capacity``.
        """
        if ep_group is not None:
            # The tensor must live on the accelerator device for the collective.
            num_tokens_tensor = torch.tensor(num_tokens, device=get_accelerator().get_current_device())
            dist.all_reduce(num_tokens_tensor, group=ep_group)
            num_tokens = num_tokens_tensor.item() // dist.get_world_size(ep_group)
        capacity_factor = self.capacity_factor_train if self.training else self.capacity_factor_eval
        capacity = math.floor(self.k_value * capacity_factor * num_tokens / num_experts)
        # Bump odd capacities to the next even number.
        capacity += capacity % 2
        capacity = max(capacity, self.min_capacity)
        assert capacity > 0
        return int(capacity)

    def set_aux_loss(self, router_probs: torch.Tensor, expert_indices: torch.Tensor, num_experts: int) -> None:
        """Computes auxiliary load balancing loss as in Switch Transformer.

        See Switch Transformer (https://arxiv.org/abs/2101.03961). This function
        implements the loss function presented in equations (4) - (6). It aims to
        penalize those cases where the routing between experts is unbalanced.

        The result is stored in ``self._aux_loss``; a previously stored loss must
        have been consumed by :meth:`pop_router_loss` first.

        Args:
            router_probs: Probability assigned to each expert per token. Shape:
                <float32>[num_groups, tokens_per_group, num_experts].
            expert_indices: <int>[num_groups, tokens_per_group, num_selected_experts]
                indices identifying the top num_selected_experts for a given token.
            num_experts: Total number of experts.

        2D inputs (without the leading ``num_groups`` axis) are accepted and
        treated as a single group.
        """
        assert self._aux_loss is None
        if router_probs.dim() == expert_indices.dim() == 2:
            router_probs = router_probs.unsqueeze(0)
            expert_indices = expert_indices.unsqueeze(0)
        assert (
            router_probs.dim() == expert_indices.dim() == 3
        ), "router_probs and expert_indices must both be 3D tensors"

        # Shape: [num_groups, tokens_per_group, num_selected_experts, num_experts].
        expert_mask = F.one_hot(expert_indices, num_experts)
        # For a given token, determine if it was routed to a given expert.
        # Shape: [num_groups, tokens_per_group, num_experts]
        expert_mask = expert_mask.max(dim=-2)[0]

        # Averages over the token axis: fraction of tokens sent to each expert
        # and mean router probability of each expert.
        tokens_per_group_and_expert = torch.mean(expert_mask.float(), dim=-2)
        router_prob_per_group_and_expert = torch.mean(router_probs.float(), dim=-2)
        aux_loss = num_experts**2 * torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert)
        self._aux_loss = aux_loss

    def set_z_loss(self, router_logits: torch.Tensor):
        """Compute router z-loss.

        The router z-loss was introduced in Designing Effective Sparse Expert Models
        (https://arxiv.org/abs/2202.08906). It encourages router logits to remain
        small in an effort to improve stability.

        The result is stored in ``self._z_loss``; a previously stored loss must
        have been consumed by :meth:`pop_router_loss` first.

        Args:
            router_logits: <float>[num_groups, tokens_per_group, num_experts] router logits.
                A 2D tensor is treated as a single group.
        """
        assert self._z_loss is None
        if router_logits.dim() == 2:
            router_logits = router_logits.unsqueeze(0)
        assert router_logits.dim() == 3, "router_logits must be 3D tensor"
        num_groups, tokens_per_group, _ = router_logits.shape
        log_z = torch.logsumexp(router_logits, dim=-1)
        # Mean of log_z ** 2 over all tokens, accumulated in float32.
        z_loss = torch.sum(log_z**2, dtype=torch.float32) / (num_groups * tokens_per_group)
        self._z_loss = z_loss

    def pop_router_loss(self) -> None:
        """Hand the staged losses to ``MOE_MANAGER`` and reset them.

        Requires that an auxiliary loss has been set; the z-loss is passed as
        is and may still be ``None`` if :meth:`set_z_loss` was not called.
        """
        assert self._aux_loss is not None
        MOE_MANAGER.add_loss(self._aux_loss, self._z_loss)
        self._aux_loss = None
        self._z_loss = None
|
|
|
|
|
|
class Top1Router(MoeRouter):
    """Top1 router that returns the dispatch mask (batch_size * seq_len, num_experts, capacity)
    and combine weight (batch_size * seq_len, num_experts, capacity) for routing usage. More detailed
    function can be found in the paper about Switch Transformer of Google.

    Args:
        capacity_factor_train (float, optional): Capacity factor in routing of training.
        capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
        min_capacity (int, optional): The minimum number of the capacity of each expert.
        select_policy (str, optional): The policy about tokens selection, either
            ``"first"`` or ``"random"``.
        noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
        drop_tks (bool, optional): Whether drops tokens in evaluation
    """

    def __init__(
        self,
        capacity_factor_train: float = 1.25,
        capacity_factor_eval: float = 2.0,
        min_capacity: int = 4,
        select_policy: str = "first",
        noisy_func: Optional[Callable] = None,
        drop_tks: bool = True,
    ):
        super().__init__(
            k_value=1,
            capacity_factor_train=capacity_factor_train,
            capacity_factor_eval=capacity_factor_eval,
            min_capacity=min_capacity,
            noisy_func=noisy_func,
            drop_tks=drop_tks,
        )
        self.select_policy = select_policy
        assert select_policy in {"first", "random"}
        if select_policy == "random":
            # Sampler of uniform noise in [0, 1) created on the accelerator device.
            self.uniform = torch.distributions.uniform.Uniform(
                low=torch.tensor(0.0, device=get_accelerator().get_current_device()),
                high=torch.tensor(1.0, device=get_accelerator().get_current_device()),
            ).rsample

    def forward(
        self,
        inputs: torch.Tensor,
        use_kernel: bool = False,
        ep_group: Optional[ProcessGroup] = None,
        use_loss: bool = False,
        use_norm: bool = False,
    ) -> Tuple:
        """Route every token to its highest-scoring expert.

        The auxiliary loss and z-loss are always computed and pushed to
        ``MOE_MANAGER``. ``use_loss`` and ``use_norm`` are accepted but not
        read by this router.

        Args:
            inputs (torch.Tensor): The input tensor of shape (batch_size * seq_len, num_experts).
                Must be ``torch.float``.
            use_kernel (bool): Selects the layout of the returned tuple (see below).
            ep_group (ProcessGroup, optional): Expert-parallel group used to
                synchronise the capacity across ranks.
            use_loss (bool): Unused.
            use_norm (bool): Unused.

        Returns:
            1. use_kernel is False:
                ``(used_capacity, combine_weights, sec_mask, probs)`` where
                ``combine_weights`` and ``sec_mask`` have shape
                (batch_size * seq_len, num_experts, capacity).
            2. use_kernel is True:
                ``(used_capacity, probs, mask, dest_idx, num_experts * capacity)``
                where ``mask`` and ``dest_idx`` have a leading axis of size 1.
        """
        if self.noisy_func is not None and self.training:
            inputs = self.noisy_func(inputs)

        assert inputs.dtype == torch.float
        probs = F.softmax(inputs, dim=-1)
        num_experts = probs.size(-1)
        num_tokens = inputs.size(0)
        capacity = self.get_capacity(num_tokens, num_experts, ep_group)

        top1_idx = torch.argmax(inputs, dim=-1)
        mask = F.one_hot(top1_idx, num_classes=num_experts).to(torch.int32)

        # calculate router loss
        self.set_aux_loss(probs, top1_idx.unsqueeze(-1), num_experts)
        self.set_z_loss(inputs)
        self.pop_router_loss()

        if not self.training and not self.drop_tks and ep_group is not None:
            # No-drop evaluation: size the buffers for the busiest expert on any rank.
            max_num = torch.max(torch.sum(mask, dim=0))
            dist.all_reduce(max_num, op=dist.ReduceOp.MAX, group=ep_group)
            capacity = max_num.item()

        if self.select_policy == "random":
            # Keep, per expert, the `capacity` assigned tokens with the largest noise.
            rand_mask = mask * self.uniform(mask.shape)
            # torch.topk raises when k exceeds the size of the reduced dimension;
            # capacity may be larger than the token count (e.g. through
            # min_capacity), in which case every token is kept.
            _, dispatch_idx = torch.topk(rand_mask, k=min(capacity, num_tokens), dim=0)
            mask = mask * torch.zeros_like(mask).scatter_(0, dispatch_idx, 1)
            ranks = moe_cumsum(mask, use_kernel=self.use_kernel)
        elif self.select_policy == "first":
            # presumably moe_cumsum yields each token's zero-based position within
            # its expert - confirm against colossalai.moe._operation
            ranks = moe_cumsum(mask, use_kernel=self.use_kernel)
            mask = mask * torch.lt(ranks, capacity)
        else:
            raise NotImplementedError("Not support such select policy yet.")

        # Collapse the expert axis: one slot index per token (0 for dropped tokens).
        ranks = torch.sum(mask * ranks, dim=-1)
        used_capacity = mask.sum(dim=0)

        if use_kernel:
            mask = torch.sum(mask, dim=-1)
            mask = torch.stack([mask], dim=0).to(torch.int32)
            # Flat destination index into a [num_experts * capacity] buffer.
            dest_idx = torch.stack([top1_idx * capacity + ranks], dim=0).to(torch.int32)
            return used_capacity, probs, mask, dest_idx, num_experts * capacity
        else:
            ranks = F.one_hot(ranks, num_classes=capacity)
            weight = mask * probs.type_as(inputs)
            # Outer product over (expert, slot) for every token.
            combine_weights = weight.unsqueeze(2) * ranks.unsqueeze(1)
            sec_mask = combine_weights.bool()
            return used_capacity, combine_weights, sec_mask, probs
|
|
|
|
|
|
class Top2Router(MoeRouter):
    """Top2 router that returns the dispatch mask (batch_size * seq_len, num_experts, capacity)
    and combine weight (batch_size * seq_len, num_experts, capacity) for routing usage. More detailed
    function can be found in the paper about ViT-MoE.

    Args:
        capacity_factor_train (float, optional): Capacity factor in routing of training.
        capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
        min_capacity (int, optional): The minimum number of the capacity of each expert
        noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
        drop_tks (bool, optional): Whether drops tokens in evaluation.
    """

    def __init__(
        self,
        capacity_factor_train: float = 1.25,
        capacity_factor_eval: float = 2.0,
        min_capacity: int = 4,
        noisy_func: Optional[Callable] = None,
        drop_tks: bool = True,
    ):
        super().__init__(
            k_value=2,
            capacity_factor_train=capacity_factor_train,
            capacity_factor_eval=capacity_factor_eval,
            min_capacity=min_capacity,
            noisy_func=noisy_func,
            drop_tks=drop_tks,
        )

    def forward(
        self,
        inputs: torch.Tensor,
        use_kernel: bool = False,
        ep_group: Optional[ProcessGroup] = None,
        use_norm: bool = False,
        use_loss: bool = True,
    ) -> Tuple:
        """Route every token to its two highest-probability experts.

        Args:
            inputs (torch.Tensor): The input tensor of shape (batch_size * seq_len, num_experts).
                Must be ``torch.float``.
            use_kernel (bool): Selects the layout of the returned tuple (see below).
            ep_group (ProcessGroup, optional): Expert-parallel group used to
                synchronise the capacity across ranks.
            use_norm (bool): If True, divide the probabilities by the sum of each
                token's two largest probabilities.
            use_loss (bool): If True, compute the auxiliary loss and z-loss and
                push them to ``MOE_MANAGER``.

        Returns:
            1. use_kernel is False:
                ``(used_capacity, cb_weight, sec_mask)`` where ``cb_weight`` and
                ``sec_mask`` have shape (batch_size * seq_len, num_experts, capacity).
            2. use_kernel is True:
                ``(used_capacity, probs, mask, dest_idx, num_experts * capacity)``
                where ``mask`` and ``dest_idx`` have a leading axis of size 2
                (first choice, second choice).
        """
        if self.noisy_func is not None and self.training:
            inputs = self.noisy_func(inputs)

        assert inputs.dtype == torch.float
        probs = F.softmax(inputs, dim=-1)
        if use_norm:
            routing_weights, _ = torch.topk(probs, 2, dim=-1)
            probs = probs / routing_weights.sum(dim=-1, keepdim=True)

        num_experts = probs.size(-1)
        num_tokens = inputs.size(0)
        capacity = self.get_capacity(num_tokens, num_experts, ep_group)

        top1_idx = torch.argmax(probs, dim=-1)
        mask1 = F.one_hot(top1_idx, num_classes=num_experts).to(torch.int32)
        # Hide the first choice so argmax yields the second-best expert.
        logits_except1 = probs.masked_fill(mask1.bool(), float("-inf"))
        top2_idx = torch.argmax(logits_except1, dim=-1)
        mask2 = F.one_hot(top2_idx, num_classes=num_experts).to(torch.int32)

        # calculate loss
        if use_loss:
            expert_indices = torch.stack([top1_idx, top2_idx], dim=-1)
            self.set_aux_loss(probs, expert_indices, num_experts)
            self.set_z_loss(inputs)
            self.pop_router_loss()

        if not self.training and not self.drop_tks and ep_group is not None:
            # No-drop evaluation: size the buffers for the busiest expert on any
            # rank. Count the raw integer assignments (first + second choices) so
            # the capacity is an int large enough to hold every routed token.
            max_num = torch.max(torch.sum(mask1 + mask2, dim=0))
            dist.all_reduce(max_num, op=dist.ReduceOp.MAX, group=ep_group)
            capacity = int(max_num.item())

        # presumably moe_cumsum yields each token's zero-based position within
        # its expert - confirm against colossalai.moe._operation
        rank1 = moe_cumsum(mask1, use_kernel=self.use_kernel)  # rank1: [s, e]
        rank2 = moe_cumsum(mask2, use_kernel=self.use_kernel)
        # Second choices are queued behind all first choices of the same expert.
        rank2 += torch.sum(mask1, dim=-2, keepdim=True)

        # Drop assignments that do not fit into the expert's capacity.
        mask1 *= torch.lt(rank1, capacity)
        mask2 *= torch.lt(rank2, capacity)
        used_capacity = mask1.sum(dim=0) + mask2.sum(dim=0)

        # Collapse the expert axis: one slot index per token (0 for dropped tokens).
        rank1 = torch.sum(mask1 * rank1, dim=-1)
        rank2 = torch.sum(mask2 * rank2, dim=-1)

        if use_kernel:
            mask1 = torch.sum(mask1, dim=-1)
            mask2 = torch.sum(mask2, dim=-1)

            mask = torch.stack([mask1, mask2], dim=0).to(torch.int32)
            # Flat destination indices into a [num_experts * capacity] buffer.
            dest_idx = torch.stack([top1_idx * capacity + rank1, top2_idx * capacity + rank2], dim=0).to(torch.int32)

            return used_capacity, probs, mask, dest_idx, num_experts * capacity
        else:
            # The following code is equivalent to:
            #
            #     weight1 = mask1 * probs.type_as(inputs)
            #     weight2 = mask2 * probs.type_as(inputs)
            #     rank1_sc = F.one_hot(rank1, num_classes=capacity)
            #     rank2_sc = F.one_hot(rank2, num_classes=capacity)
            #
            #     cb_weight1 = weight1.unsqueeze(2) * rank1_sc.unsqueeze(1)
            #     cb_weight2 = weight2.unsqueeze(2) * rank2_sc.unsqueeze(1)
            #     cb_weight = cb_weight1 + cb_weight2
            #     sec_mask = cb_weight.bool()
            #
            # but writes only the (token, expert, slot) entries that can be non-zero.
            weight1 = mask1 * probs.type_as(inputs)
            weight2 = mask2 * probs.type_as(inputs)

            cb_weight = torch.zeros(inputs.shape + (capacity,), device=inputs.device)
            sec_mask = torch.zeros_like(cb_weight, dtype=torch.bool)
            indices = torch.arange(0, inputs.shape[0], device=inputs.device)
            # Dropped tokens carry weight 0 / mask False, so their writes are no-ops.
            cb_weight[indices, top1_idx, rank1] += weight1[indices, top1_idx]
            cb_weight[indices, top2_idx, rank2] += weight2[indices, top2_idx]
            sec_mask[indices, top1_idx, rank1] |= mask1.bool()[indices, top1_idx]
            sec_mask[indices, top2_idx, rank2] |= mask2.bool()[indices, top2_idx]

            return used_capacity, cb_weight, sec_mask
|
|
|
|
|
|
class TopKRouter(MoeRouter):
    """Masked matmul router using tokens choose top-k experts assignment.

    NOTE: this is modified from flaxformer.
    This router uses the same mechanism as in Switch Transformer
    (https://arxiv.org/abs/2101.03961) and V-MoE
    (https://arxiv.org/abs/2106.05974): tokens choose their top experts. Items are
    sorted by router_probs and then routed to their choice of expert until the
    expert's expert_capacity is reached. There is no guarantee that each token is
    processed by an expert, or that each expert receives at least one token.

    Attributes:
        num_selected_experts: Maximum number of experts to which each token is
            routed. Tokens may be routed to fewer experts if particular experts are
            oversubscribed / reach capacity.
    """

    def __init__(
        self,
        num_selected_experts: int,
        capacity_factor_train: float = 1.25,
        capacity_factor_eval: float = 2.0,
        min_capacity: int = 4,
        noisy_func: Optional[Callable] = None,
        drop_tks: bool = True,
    ):
        super().__init__(
            k_value=num_selected_experts,
            capacity_factor_train=capacity_factor_train,
            capacity_factor_eval=capacity_factor_eval,
            min_capacity=min_capacity,
            noisy_func=noisy_func,
            drop_tks=drop_tks,
        )

    def forward(
        self,
        router_probs: torch.Tensor,
        expert_capacity: int,
    ) -> Tuple:
        """Computes masks for the top-k experts per token.

        The auxiliary load-balancing loss is computed and pushed to
        ``MOE_MANAGER`` as a side effect.

        Args:
            router_probs: <float32>[num_groups, tokens_per_group, num_experts]
                probabilities used to determine the routing of tokens to the experts.
            expert_capacity: Number of slots available in each expert's buffer.

        Returns:
            Dispatch and combine arrays for routing with masked matmuls, as the
            tuple ``(combine_array, dispatch_mask)``.
        """
        # TODO: FIXME: add parallel group
        num_groups, _, num_experts = router_probs.shape
        top_k = self.k_value

        # Expert indices of the top-k probabilities of every token (the gate
        # values themselves are not needed here).
        # Shape: [num_groups, tokens_per_group, num_selected_experts].
        _, expert_index = torch.topk(router_probs, top_k)

        self.set_aux_loss(router_probs, expert_index, num_experts)
        self.pop_router_loss()

        # Put num_selected_experts in front of the token axis before flattening,
        # so that all top-1 choices precede all top-2 choices, and so on.
        # Shape: [num_groups, num_selected_experts * tokens_per_group]
        flat_index = expert_index.transpose(1, 2).reshape(num_groups, -1)

        # Shape: [num_groups, tokens_per_group * num_selected_experts, num_experts].
        expert_mask = F.one_hot(flat_index, num_experts).to(torch.int32)

        # Running count of assignments per expert gives each assignment its
        # position in the expert's buffer; entries for non-targeted experts are -1.
        # Shape: [num_groups, tokens_per_group * num_selected_experts, num_experts].
        priority = expert_mask.cumsum(dim=1) * expert_mask - 1
        # Shape: [num_groups, tokens_per_group, num_selected_experts, num_experts].
        priority = priority.reshape((num_groups, top_k, -1, num_experts)).transpose(1, 2)
        # Reduce over the selected experts: what remains is non-negative exactly
        # where the expert is one of the token's targets.
        # Shape: [num_groups, tokens_per_group, num_experts].
        priority = priority.max(dim=2)[0]

        # A token reaches an expert only if its position lies in
        # [0, expert_capacity).
        routable = (priority >= 0) & (priority < expert_capacity)
        # Invalid positions are zeroed so one_hot accepts them; they are masked
        # out again right below.
        slot = priority.masked_fill(~routable, 0)
        # Shape: [num_groups, tokens_per_group, num_experts, expert_capacity].
        dispatch_mask = F.one_hot(slot, expert_capacity).to(torch.bool)
        routable_per_slot = routable.unsqueeze(-1).expand(-1, -1, -1, expert_capacity)
        dispatch_mask = dispatch_mask.masked_fill(~routable_per_slot, False)

        # Combine array: dispatch mask scaled by the router probabilities.
        # Shape: [num_groups, tokens_per_group, num_experts, expert_capacity].
        combine_array = torch.einsum("...te,...tec->...tec", router_probs, dispatch_mask)

        return combine_array, dispatch_mask
|
|
|
|
|
|
def get_router_cls(top_k: int, grouped: bool = False) -> MoeRouter:
    """Return the router class matching the requested configuration.

    Args:
        top_k (int): Number of experts each token is routed to. Only consulted
            when ``grouped`` is False.
        grouped (bool): If True, ``TopKRouter`` is returned regardless of ``top_k``.

    Returns:
        The router class (not an instance): ``Top1Router`` for ``top_k == 1``,
        ``Top2Router`` for ``top_k == 2``, or ``TopKRouter`` when ``grouped``.

    Raises:
        NotImplementedError: If ``grouped`` is False and ``top_k`` is neither 1 nor 2.
    """
    if grouped:
        return TopKRouter
    if top_k == 1:
        return Top1Router
    if top_k == 2:
        return Top2Router
    raise NotImplementedError("top_k > 2 is not supported yet")
|