mirror of https://github.com/hpcaitech/ColossalAI
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
152 lines
5.5 KiB
152 lines
5.5 KiB
3 years ago
|
import torch
|
||
|
import torch.distributed as dist
|
||
|
from .parallel_mode import ParallelMode
|
||
|
|
||
|
|
||
|
def _check_sanity():
    """Verify that MoE is not combined with tensor or pipeline parallelism.

    Reads ``tensor_parallel_size`` and ``pipeline_parallel_size`` from the
    global context and returns silently when neither exceeds 1.

    Raises:
        NotImplementedError: if either parallel size is greater than 1.
    """
    # Imported lazily inside the function, as in the rest of this module.
    from colossalai.core import global_context as gpc

    # Guard clause: the supported configuration returns immediately.
    if gpc.tensor_parallel_size <= 1 and gpc.pipeline_parallel_size <= 1:
        return

    raise NotImplementedError("Moe is not compatible with tensor or pipeline parallel at present.")
|
||
|
|
||
|
|
||
|
class MoeInfo:
    """Parallel sizes and communication groups for one MoE layout.

    Attributes set by the constructor:
        ep_size: expert parallel size passed in by the caller.
        dp_size: data parallel size (for experts) passed in by the caller.
        ep_group: process group used for expert parallelism.
        dp_group: process group used for data parallelism among experts;
            it may differ from ``gpc.get_group(ParallelMode.DATA)`` because
            the expert parallel group differs.
    """

    def __init__(self, ep_size: int, dp_size: int):
        """Build (or reuse) the expert parallel and data parallel groups.

        Args:
            ep_size: number of ranks in each expert parallel group.
            dp_size: number of ranks in each expert data parallel group.

        Raises:
            NotImplementedError: propagated from ``_check_sanity`` when
                tensor or pipeline parallelism is enabled.
        """
        _check_sanity()
        self.ep_size, self.dp_size = ep_size, dp_size
        self.ep_group = None
        self.dp_group = None

        # Tensor parallel size is assumed to be 1 here (MoE cannot be used
        # otherwise), so the already-created TENSOR and DATA groups can be
        # reused directly whenever one of the two sizes is 1.
        if ep_size == 1:
            ep_mode, dp_mode = ParallelMode.TENSOR, ParallelMode.DATA
        elif dp_size == 1:
            ep_mode, dp_mode = ParallelMode.DATA, ParallelMode.TENSOR
        else:
            ep_mode = dp_mode = None

        if ep_mode is not None:
            from colossalai.core import global_context as gpc
            self.ep_group = gpc.get_group(ep_mode)
            self.dp_group = gpc.get_group(dp_mode)
            return

        my_rank = dist.get_rank()

        # Expert parallel groups: blocks of `ep_size` consecutive ranks.
        # Every rank calls new_group for every block, and keeps only the
        # group that contains itself.
        for block_idx in range(dp_size):
            first = block_idx * ep_size
            members = list(range(first, first + ep_size))
            created = dist.new_group(members)
            if my_rank in members:
                self.ep_group = created

        # Data parallel groups: ranks sharing the same offset inside their
        # expert parallel block, i.e. ranks strided by `ep_size`.
        for offset in range(ep_size):
            members = list(range(offset, dp_size * ep_size, ep_size))
            created = dist.new_group(members)
            if my_rank in members:
                self.dp_group = created
|
||
|
|
||
|
|
||
|
class MoeContext:
    """MoE parallel context manager.

    This class manages the different parallel groups used in an MoE context
    and accumulates the MoE auxiliary loss during training. A process-wide
    instance is available through :meth:`get_instance`.
    """
    __instance = None

    @staticmethod
    def get_instance():
        """Return the shared ``MoeContext``, creating it on first use."""
        if MoeContext.__instance is None:
            MoeContext.__instance = MoeContext()
        return MoeContext.__instance

    def __init__(self):
        """Initialise defaults; distributed state is filled in by ``setup``."""
        self.world_size = 1
        # Users may want to set the maximum expert parallel size smaller than
        # the world size, since very low bandwidth across nodes may constrain
        # the performance of MoE. A maximum expert parallel size naturally
        # implies a minimum data parallel size.
        self.max_ep_size = 1
        self.min_dp_size = 1
        # None until reset_loss() or the first add_loss() call.
        self.aux_loss = None
        self.use_kernel_optim = True

        self.has_setup = False
        # Maps an expert parallel size to the MoeInfo created for it.
        self._info_dict = {}

    @property
    def information(self):
        """Dictionary mapping expert parallel size to its ``MoeInfo``."""
        return self._info_dict

    @property
    def is_initialized(self):
        """Whether ``setup`` has completed."""
        return self.has_setup

    def setup(self, seed: int, use_kernel_optim: bool = True):
        """Set up the MoE distributed context. May only be called once.

        Args:
            seed: value forwarded to ``moe_set_seed``.
            use_kernel_optim: whether kernel optimization is enabled.

        Raises:
            AssertionError: if the context was already set up, CUDA is not
                available, or ``max_ep_size`` does not divide the world size.
            NotImplementedError: propagated from ``_check_sanity``.
        """
        assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
        _check_sanity()
        assert torch.cuda.is_available(), "MoE requires to enable CUDA first"

        self.world_size = dist.get_world_size()

        from colossalai.core import global_context as gpc
        # Defaults to the world size when the config has no 'max_ep_size'.
        self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
        assert self.world_size % self.max_ep_size == 0, \
            "Maximum epxert parallel size must be a factor of the number of GPUs"
        self.min_dp_size = self.world_size // self.max_ep_size

        # Enabling kernel optimization may raise errors in some cases;
        # users can switch it off manually.
        self.use_kernel_optim = use_kernel_optim

        from .random import moe_set_seed
        moe_set_seed(seed)
        self.has_setup = True

    def get_info(self, num_experts: int):
        """Automatically place experts and return the parallel information.

        Args:
            num_experts: total number of experts; it must be a multiple or
                a divisor of ``max_ep_size``.

        Returns:
            A tuple ``(num_local_experts, info)`` where ``info`` is the entry
            of ``information`` for the computed expert parallel size (a
            ``MoeInfo`` is created and cached when none exists yet).

        Raises:
            AssertionError: if ``num_experts`` is neither a multiple nor a
                divisor of ``max_ep_size``.
        """
        # num_experts is a multiple of max_ep_size (as many or more experts)
        gt_flag = num_experts % self.max_ep_size == 0
        # num_experts is a divisor of max_ep_size (as many or fewer experts)
        lt_flag = self.max_ep_size % num_experts == 0

        assert gt_flag or lt_flag, "Automatic experts placement do not support such situation right now."

        # If the number of experts is at least the maximum expert parallel
        # size, each GPU holds several distinct experts, so the data parallel
        # size is 1. Otherwise each GPU holds exactly one expert and the data
        # parallel size has to be computed.
        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
        ep_size = self.max_ep_size // dp_size

        # Number of experts hosted by each GPU
        num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size

        # Don't forget to multiply by the minimum data parallel size
        dp_size *= self.min_dp_size
        if ep_size not in self.information:
            self.information[ep_size] = MoeInfo(ep_size, dp_size)

        return num_local_experts, self.information[ep_size]

    def set_kernel_not_use(self):
        """Disable kernel optimization."""
        self.use_kernel_optim = False

    def reset_loss(self):
        """Reset the accumulated auxiliary loss to 0."""
        self.aux_loss = 0

    def add_loss(self, loss):
        """Add ``loss`` to the accumulated auxiliary loss.

        Works even when ``reset_loss`` has not been called yet: the
        accumulator then starts from 0 instead of failing on ``None += loss``.
        """
        if self.aux_loss is None:
            # Start from 0 (same state as after reset_loss) rather than
            # storing `loss` itself, so the caller's object is never aliased
            # and later mutated by the in-place addition below.
            self.aux_loss = 0
        self.aux_loss += loss

    def get_loss(self):
        """Return the accumulated auxiliary loss (``None`` if never touched)."""
        return self.aux_loss
|