From 9473a1b9c8388ef5854ffcf01017805d681f616b Mon Sep 17 00:00:00 2001
From: puck_WCR <46049915+WANG-CR@users.noreply.github.com>
Date: Tue, 18 Jan 2022 18:33:36 +0800
Subject: [PATCH] AMP docstring/markdown update (#160)

---
 colossalai/amp/apex_amp/__init__.py   |  2 +-
 colossalai/amp/naive_amp/__init__.py  |  2 +-
 colossalai/amp/torch_amp/torch_amp.py | 12 ++++++++++++
 docs/amp.md                           |  2 +-
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/colossalai/amp/apex_amp/__init__.py b/colossalai/amp/apex_amp/__init__.py
index 16e7c36f3..23585ede7 100644
--- a/colossalai/amp/apex_amp/__init__.py
+++ b/colossalai/amp/apex_amp/__init__.py
@@ -6,7 +6,7 @@ from torch.optim import Optimizer
 
 
 def convert_to_apex_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with Apex AMP modules
 
     :param model: your model object
     :type model: :class:`torch.nn.Module`
diff --git a/colossalai/amp/naive_amp/__init__.py b/colossalai/amp/naive_amp/__init__.py
index c050ee937..32ea3469a 100644
--- a/colossalai/amp/naive_amp/__init__.py
+++ b/colossalai/amp/naive_amp/__init__.py
@@ -8,7 +8,7 @@ from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel
 
 
 def convert_to_naive_amp(model: nn.Module, optimizer: Optimizer, amp_config):
-    """A helper function to wrap training components with Torch AMP modules
+    """A helper function to wrap training components with naive AMP modules
 
     :param model: your model object
     :type model: :class:`torch.nn.Module`
diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/amp/torch_amp/torch_amp.py
index b323a25c5..c90895de7 100644
--- a/colossalai/amp/torch_amp/torch_amp.py
+++ b/colossalai/amp/torch_amp/torch_amp.py
@@ -18,6 +18,17 @@ class TorchAMPOptimizer(ColossalaiOptimizer):
 
     :param optim: a normal optimizer like Adam or SGD
     :type optim: torch.optim.Optimizer
+    :param init_scale: Initial scale factor
+    :type init_scale: float, optional, default=2.**16
+    :param growth_factor: Factor by which the scale is multiplied during :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+    :type growth_factor: float, optional, default=2.0
+    :param backoff_factor: Factor by which the scale is multiplied during :meth:`update` if inf/NaN gradients occur in an iteration.
+    :type backoff_factor: float, optional, default=0.5
+    :param growth_interval: Number of consecutive iterations without inf/NaN gradients that must occur for the scale to be multiplied by ``growth_factor``.
+    :type growth_interval: int, optional, default=2000
+    :param enabled: If ``False``, disables gradient scaling. :meth:`step` simply invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+    :type enabled: bool, optional, default=True
+
     """
 
     def __init__(self, optim: Optimizer, *args, **kwargs):
@@ -68,6 +79,7 @@ class TorchAMPLoss(nn.Module):
     :param loss: a loss function object
     :type loss: torch.nn.modules.loss._Loss
     """
+
     def __init__(self, loss: _Loss):
         super().__init__()
         self.loss = loss
diff --git a/docs/amp.md b/docs/amp.md
index 8072849f5..40892f750 100644
--- a/docs/amp.md
+++ b/docs/amp.md
@@ -82,7 +82,7 @@ fp16 = dict(
 We leveraged the Megatron-LM implementation to achieve mixed precision training while maintaining compatibility with complex tensor and pipeline parallelism.
 
 This AMP mode will cast all operations into fp16.
-The following conde block show a config file for this mode. 
+The following code block shows a config file for this mode.
 
 ```python
 from colossalai.amp import AMP_TYPE
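
The parameters added to the `TorchAMPOptimizer` docstring mirror the keyword arguments of `torch.cuda.amp.GradScaler`. The sketch below, which is not part of this patch, shows how such values would typically be supplied through the `fp16` section of a ColossalAI config; it assumes `mode=AMP_TYPE.TORCH` forwards these keys to the scaler, and the key names and defaults are taken from the new docstring.

```python
# Hypothetical config sketch: assumes a ColossalAI `fp16` config with
# mode=AMP_TYPE.TORCH forwards these keys to torch.cuda.amp.GradScaler,
# using the default values listed in the updated docstring.
from colossalai.amp import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.TORCH,
    init_scale=2.**16,     # initial loss-scale factor
    growth_factor=2.0,     # scale grows by this factor after `growth_interval` clean steps
    backoff_factor=0.5,    # scale shrinks by this factor when an inf/NaN gradient appears
    growth_interval=2000,  # consecutive clean iterations required before the scale grows
    enabled=True,          # set to False to make gradient scaling a no-op
)
```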