From 8432dc70802951227aee31da28b14f356aa3fe4c Mon Sep 17 00:00:00 2001
From: ver217
Date: Fri, 1 Apr 2022 16:15:36 +0800
Subject: [PATCH] polish moe docstring (#618)

---
 colossalai/nn/layer/moe/layers.py | 13 ++++++++++---
 colossalai/nn/layer/moe/utils.py  |  6 +++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py
index 04b4f1a58..aaa261b23 100644
--- a/colossalai/nn/layer/moe/layers.py
+++ b/colossalai/nn/layer/moe/layers.py
@@ -320,15 +320,22 @@ class MoeModule(nn.Module):
         capacity_factor_eval (float, optional): Capacity factor in routing during evaluation
         min_capacity (int, optional): The minimum capacity of each expert
         noisy_policy (str, optional): The policy of the noisy function. Now we have 'Jitter' and 'Gaussian'.
-            'Jitter' can be found in Switch Transformer paper (https://arxiv.org/abs/2101.03961).
-            'Gaussian' can be found in ViT-MoE paper (https://arxiv.org/abs/2106.05974).
+            'Jitter' can be found in the `Switch Transformer paper`_.
+            'Gaussian' can be found in the `ViT-MoE paper`_.
         drop_tks (bool, optional): Whether to drop tokens in evaluation
         use_residual (bool, optional): Makes this MoE layer a Residual MoE.
-            More information can be found in Microsoft paper (https://arxiv.org/abs/2201.05596).
+            More information can be found in the `Microsoft paper`_.
         residual_instance (nn.Module, optional): The instance of the residual module in Residual MoE
         expert_instance (MoeExperts, optional): The instance of the experts module in MoeLayer
         expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given
         expert_args (optional): The args of each expert when no instance is given
+
+    .. _Switch Transformer paper:
+        https://arxiv.org/abs/2101.03961
+    .. _ViT-MoE paper:
+        https://arxiv.org/abs/2106.05974
+    .. _Microsoft paper:
+        https://arxiv.org/abs/2201.05596
     """

     def __init__(self,
diff --git a/colossalai/nn/layer/moe/utils.py b/colossalai/nn/layer/moe/utils.py
index a13c8184e..3a1258bd1 100644
--- a/colossalai/nn/layer/moe/utils.py
+++ b/colossalai/nn/layer/moe/utils.py
@@ -14,8 +14,8 @@ class ForceFP32Parameter(torch.nn.Parameter):
 class NormalNoiseGenerator:
     """Generates a random noisy mask for logits tensor.

-    All noise is generated from a normal distribution (0, 1 / E^2), where
-    E = the number of experts.
+    All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where
+    :math:`E` is the number of experts.

     Args:
         num_experts (int): The number of experts.
@@ -34,7 +34,7 @@ class UniformNoiseGenerator:
     """Generates a random noisy mask for logits tensor.
     copied from mesh tensorflow:
-    Multiply values by a random number between 1-epsilon and 1+epsilon.
+    Multiply values by a random number between :math:`1-\epsilon` and :math:`1+\epsilon`.
     Makes models more resilient to rounding errors introduced by bfloat16.
     This seems particularly important for logits.
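
For illustration, a minimal sketch of the two noise schemes the docstrings above
describe, in plain PyTorch. The helper names `normal_noise` and `uniform_noise`
are hypothetical (not part of this patch), and the sketch treats 1 / E^2 as the
scale of the normal distribution, matching the docstring's wording.

    import torch


    def normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
        # Additive noise from a normal distribution with mean 0 and
        # scale 1 / E^2, where E is the number of experts
        # (the scheme NormalNoiseGenerator's docstring describes).
        return logits + torch.randn_like(logits) / num_experts**2


    def uniform_noise(logits: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
        # Multiplicative noise: scale each logit by a factor drawn
        # uniformly from [1 - eps, 1 + eps]
        # (the scheme UniformNoiseGenerator's docstring describes).
        factor = torch.empty_like(logits).uniform_(1.0 - eps, 1.0 + eps)
        return logits * factor

In either case the perturbed logits would be fed to the router's top-k expert
selection, e.g. `uniform_noise(router_logits)` in place of the raw logits.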