From 8432dc70802951227aee31da28b14f356aa3fe4c Mon Sep 17 00:00:00 2001
From: ver217
Date: Fri, 1 Apr 2022 16:15:36 +0800
Subject: [PATCH] polish moe docstring (#618)

---
 colossalai/nn/layer/moe/layers.py | 13 ++++++++++---
 colossalai/nn/layer/moe/utils.py  |  6 +++---
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/colossalai/nn/layer/moe/layers.py b/colossalai/nn/layer/moe/layers.py
index 04b4f1a58..aaa261b23 100644
--- a/colossalai/nn/layer/moe/layers.py
+++ b/colossalai/nn/layer/moe/layers.py
@@ -320,15 +320,22 @@ class MoeModule(nn.Module):
         capacity_factor_eval (float, optional): Capacity factor in routing during evaluation
         min_capacity (int, optional): The minimum capacity of each expert
         noisy_policy (str, optional): The policy of the noisy function. Now we have 'Jitter' and 'Gaussian'.
-            'Jitter' can be found in Switch Transformer paper (https://arxiv.org/abs/2101.03961).
-            'Gaussian' can be found in ViT-MoE paper (https://arxiv.org/abs/2106.05974).
+            'Jitter' can be found in the `Switch Transformer paper`_.
+            'Gaussian' can be found in the `ViT-MoE paper`_.
         drop_tks (bool, optional): Whether to drop tokens in evaluation
         use_residual (bool, optional): Makes this MoE layer a Residual MoE.
-            More information can be found in Microsoft paper (https://arxiv.org/abs/2201.05596).
+            More information can be found in the `Microsoft paper`_.
         residual_instance (nn.Module, optional): The instance of the residual module in Residual MoE
         expert_instance (MoeExperts, optional): The instance of the experts module in MoeLayer
         expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given
         expert_args (optional): The args of each expert when no instance is given
+
+    .. _Switch Transformer paper:
+        https://arxiv.org/abs/2101.03961
+    .. _ViT-MoE paper:
+        https://arxiv.org/abs/2106.05974
+    .. _Microsoft paper:
+        https://arxiv.org/abs/2201.05596
     """

     def __init__(self,
diff --git a/colossalai/nn/layer/moe/utils.py b/colossalai/nn/layer/moe/utils.py
index a13c8184e..3a1258bd1 100644
--- a/colossalai/nn/layer/moe/utils.py
+++ b/colossalai/nn/layer/moe/utils.py
@@ -14,8 +14,8 @@ class ForceFP32Parameter(torch.nn.Parameter):
 class NormalNoiseGenerator:
     """Generates a random noisy mask for logits tensor.

-    All noise is generated from a normal distribution (0, 1 / E^2), where
-    E = the number of experts.
+    All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where
+    :math:`E` is the number of experts.

     Args:
         num_experts (int): The number of experts.
@@ -34,7 +34,7 @@ class UniformNoiseGenerator:
     """Generates a random noisy mask for logits tensor.
     copied from mesh tensorflow:
-    Multiply values by a random number between 1-epsilon and 1+epsilon.
+    Multiply values by a random number between :math:`1-\epsilon` and :math:`1+\epsilon`.
     Makes models more resilient to rounding errors introduced by bfloat16.
     This seems particularly important for logits.
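
For illustration, a minimal sketch of the two noise schemes the docstrings above
describe, in plain PyTorch. The helper names `normal_noise` and `uniform_noise`
are hypothetical (not part of this patch), and the sketch treats 1 / E^2 as the
scale of the normal distribution, matching the docstring's wording.

    import torch


    def normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
        # Additive noise from a normal distribution with mean 0 and
        # scale 1 / E^2, where E is the number of experts
        # (the scheme NormalNoiseGenerator's docstring describes).
        return logits + torch.randn_like(logits) / num_experts**2


    def uniform_noise(logits: torch.Tensor, eps: float = 1e-2) -> torch.Tensor:
        # Multiplicative noise: scale each logit by a factor drawn
        # uniformly from [1 - eps, 1 + eps]
        # (the scheme UniformNoiseGenerator's docstring describes).
        factor = torch.empty_like(logits).uniform_(1.0 - eps, 1.0 + eps)
        return logits * factor

In either case the perturbed logits would be fed to the router's top-k expert
selection, e.g. `uniform_noise(router_logits)` in place of the raw logits.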