mirror of https://github.com/hpcaitech/ColossalAI
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
52 lines
2.0 KiB
52 lines
2.0 KiB
3 years ago
|
import torch.nn as nn
|
||
|
import torch.distributed as dist
|
||
|
from colossalai.core import global_context as gpc, moe_context as moe_env
|
||
|
from colossalai.context import ParallelMode
|
||
|
from .common import is_using_ddp
|
||
|
from typing import Dict, List
|
||
|
|
||
|
|
||
|
def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]:
    """Group the parameters of a model by their expert parallel size.

    Parameters that carry no ``moe_info`` attribute are data parallel
    parameters; they are replicated on every GPU, so they are filed under
    an expert parallel size of 1.

    :param model: A PyTorch module whose parameters are grouped
    :type model: torch.nn.Module
    :return: A dict mapping each expert parallel size to the list of
        parameters with that size, in the order ``model.parameters()``
        yields them
    :rtype: Dict[int, List[nn.Parameter]]
    """
    grouped: Dict[int, List[nn.Parameter]] = {}
    for p in model.parameters():
        # Fall back to ep_size 1 for parameters without MoE information.
        size = p.moe_info.ep_size if hasattr(p, 'moe_info') else 1
        grouped.setdefault(size, []).append(p)
    return grouped
|
||
|
|
||
|
|
||
|
def sync_moe_model_param(model: nn.Module):
    """Broadcast model parameters so they are consistent in the MoE parallel context.

    Nothing is done unless ``is_using_ddp()`` returns a true value.

    :param model: A PyTorch module whose parameters are made consistent
    :type model: torch.nn.Module
    """
    if not is_using_ddp():
        return

    params_by_epsize = get_moe_epsize_param_dict(model)

    # Synchronize the parameters whose dp_group is the whole world
    if 1 in params_by_epsize:
        root = gpc.get_ranks_in_group(ParallelMode.DATA)[0]
        for p in params_by_epsize[1]:
            dist.broadcast(p, src=root, group=gpc.get_group(ParallelMode.DATA))

    for size, members in params_by_epsize.items():
        # ep_size 1 was handled above; when ep_size equals world_size,
        # communication is not needed
        if size == 1 or size == moe_env.world_size:
            continue
        # NOTE(review): this is the calling process's rank inside ep_group,
        # used as `src` for a broadcast over each parameter's dp_group --
        # confirm this matches the intended source rank for the group layout.
        root = dist.get_rank(moe_env.information[size].ep_group)
        for p in members:
            dist.broadcast(p, src=root, group=p.moe_info.dp_group)
|