docs(*): add documentation and reST files for readthedocs (#272)

* add initial reST files for readthedocs

* fix typos

* docs refine and minor fix

* add references for parallel training section

* fix reST format

* fix reST format

* fix reST format

* add comments for trainer API

* add link to step-by-step quickstart guide

* docs(code-docs/source/parallel.rst): add paper link url

* docs(code-docs/source/parallel.rst): add paper link url

* use MyST to render markdown

* docs(code-docs/source/initialize.rst): update model init

* add requirements for myst-parser

* reuse install and usage markdown

* docs(code-docs/source/index.rst): add example and q&a

* docs(doc/code-docs/*): docs refine

* docs(code-docs/source/parallel.rst): update docs for zero config

* docs(code-docs/source/example.rst): fix typos for example.rst

* docs(code-docs/source/example.rst): refine docs

* docs(code-docs/source/example): update example

* docs(code-docs/source/example): delete useless example

* docs(code-docs/source/*): fix image display issue

* docs(code-docs/source/parallel.rst): add docs for communication overlap

* docs(code-docs/source/conf.py): update conf.py

* docs(code-docs/source/example): update example 30B demo

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update ZeRO1.5

* docs(code-docs/source/parallel.rst): update ZeRO1.5

* docs(code-docs/source): fix word spelling error

---------

Co-authored-by: huangting4201 <huangting3@sensetime.com>
Season 2023-09-06 15:36:03 +08:00 committed by GitHub
parent 7f61505fa0
commit b6d909d43e
28 changed files with 755 additions and 110 deletions

View File

@@ -1,6 +1,5 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
@@ -8,3 +7,5 @@ enum_tools
numpy
torch
tqdm
pyecharts
myst-parser

View File

@@ -1,2 +1,12 @@
Model Checkpointing
===================
InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation,
we call ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports
automatic loading of the latest checkpoint at startup and automatic checkpointing when a quit signal is received.
Checkpointing
-------------
.. autoclass:: internlm.utils.model_checkpoint.CheckpointManager
:members:
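As a rough usage sketch, assuming a training loop in which ``train_one_step`` and the constructor arguments are purely illustrative (the real signature is documented above):

.. code-block:: python

    from internlm.utils.model_checkpoint import CheckpointManager

    # Hypothetical construction for illustration; consult CheckpointManager's
    # actual signature before use.
    ckpt_manager = CheckpointManager(ckpt_config=gpc.config.ckpt)

    for step in range(total_steps):
        train_one_step(...)  # placeholder for one training iteration
        # Saves training states when the configured step interval is reached.
        ckpt_manager.try_save_checkpoint(train_state)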

View File

@@ -12,19 +12,25 @@ import sys
project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"
with open("../../../version.txt", "r") as f:
release = f.readline().rstrip()
master_doc = 'index'
autodoc_member_order = 'bysource'
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.viewcode",
"sphinx.ext.autodoc",
"sphinxcontrib.autodoc_pydantic",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
"myst_parser",
]
pygments_style = "sphinx"
@@ -71,7 +77,7 @@ html_static_path = ["_static"]
# GitHub integration
html_context = {
"display_github": True,
"github_user": "pjlab",
"github_user": "InternLM",
"github_repo": "InternLM",
"github_version": "master",
"conf_py_path": "/doc/code-docs/source/",

View File

@@ -0,0 +1,203 @@
30B Demo
================
Training Config
----------------
30B demo config file example:
.. code-block:: python
JOB_NAME = "30b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
NUM_LAYER = 60
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro-batches contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, which disables evaluation
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with fewer than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=-1,
tensor=4,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
Start Training
----------------
After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs.
.. code-block:: bash
srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/30B_sft.py
Training Results
----------------
Taking the demo training on two nodes with 16 GPUs on slurm as an example, the training log is shown below:
.. code-block:: bash
2023-09-06 10:29:26,629 INFO parallel_context.py:508 in set_device -- process rank 10 is bound to host:HOST-10-140-66-20 device: 2
2023-09-06 10:29:26,632 INFO parallel_context.py:508 in set_device -- process rank 11 is bound to host:HOST-10-140-66-20 device: 3
2023-09-06 10:29:26,634 INFO parallel_context.py:508 in set_device -- process rank 12 is bound to host:HOST-10-140-66-20 device: 4
2023-09-06 10:29:26,636 INFO parallel_context.py:508 in set_device -- process rank 9 is bound to host:HOST-10-140-66-20 device: 1
2023-09-06 10:29:26,640 INFO parallel_context.py:508 in set_device -- process rank 15 is bound to host:HOST-10-140-66-20 device: 7
2023-09-06 10:29:26,639 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:HOST-10-140-66-9 device: 0
2023-09-06 10:29:26,641 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:HOST-10-140-66-9 device: 2
2023-09-06 10:29:26,643 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:HOST-10-140-66-9 device: 5
2023-09-06 10:29:26,645 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:HOST-10-140-66-9 device: 6
2023-09-06 10:29:26,661 INFO parallel_context.py:508 in set_device -- process rank 13 is bound to host:HOST-10-140-66-20 device: 5
2023-09-06 10:29:26,707 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:HOST-10-140-66-9 device: 1
2023-09-06 10:29:26,826 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:HOST-10-140-66-9 device: 4
2023-09-06 10:29:26,871 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:HOST-10-140-66-9 device: 7
2023-09-06 10:29:26,932 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:HOST-10-140-66-9 device: 3
2023-09-06 10:29:27,156 INFO parallel_context.py:508 in set_device -- process rank 14 is bound to host:HOST-10-140-66-20 device: 6
2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0
2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4
2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0
2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan
2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan
2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan
2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan
2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan
2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan

View File

@@ -0,0 +1,193 @@
7B Demo
================
Training Config
----------------
7B demo config file example:
.. code-block:: python
JOB_NAME = "7b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro-batches contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, which disables evaluation
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with fewer than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=8,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
Start Training
----------------
After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs.
.. code-block:: bash
srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
Training Results
----------------
Taking the demo training on a single machine with 8 GPUs on slurm as an example, the training log is shown below:
.. code-block:: bash
2023-09-05 11:47:44,649 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:SH-IDC1-10-140-1-110 device: 4
2023-09-05 11:47:44,650 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:SH-IDC1-10-140-1-110 device: 3
2023-09-05 11:47:44,651 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:SH-IDC1-10-140-1-110 device: 6
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:SH-IDC1-10-140-1-110 device: 7
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:SH-IDC1-10-140-1-110 device: 5
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:SH-IDC1-10-140-1-110 device: 1
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:SH-IDC1-10-140-1-110 device: 2
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0
2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1
2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0
2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan
2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan
2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan
2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan
2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan
2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan

View File

@@ -0,0 +1,18 @@
Training Example
================
7B Demo
------------
.. toctree::
:maxdepth: 2
7B_demo
30B Demo
------------
.. toctree::
:maxdepth: 2
30B_demo

View File

@@ -3,6 +3,7 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
InternLM
========
@@ -14,6 +15,14 @@ Environment Setup
install
Quickstart Guide
-------------------
.. toctree::
:maxdepth: 2
usage
Model Setup
-------------------
@@ -39,7 +48,7 @@ Parallel Training
parallel
Model Checkpointing
-------------------
--------------------
.. toctree::
:maxdepth: 2
@@ -62,6 +71,22 @@ Monitor
monitor
Example
-------------------
.. toctree::
:maxdepth: 2
example/index
Q&A
-------------------
.. toctree::
:maxdepth: 2
qa
Indices and tables
==================

View File

@@ -20,16 +20,71 @@ parser with some builtin arguments, users can add custom parameters to this pars
.. autofunction:: internlm.initialize.get_default_parser
.. _InternLM-init:
.. _InternLM-model-init:
Model Initialization
-------------------------
.. autofunction:: internlm.train.initialize_model
InternLM uses the fields ``model_type`` and ``model`` in the config file to control the model initialization process. An example model initialization configuration
can be defined as follows:
.. code-block:: python
model_type = "INTERNLM" # default is "INTERNLM", used to register classes and modules for model initialization
NUM_ATTENTION_HEAD = 32
VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_LAYER = 32
MLP_RATIO = 8 / 3
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
- The field ``model_type`` specifies the type of registered model to be initialized.
- The parameters in the field ``model`` specify the configuration settings used during model initialization.
It is worth noting that users can define a new model type and register the model's initialization function with the decorator ``@MODEL_INITIALIZER.register_module``, where ``MODEL_INITIALIZER`` is an instance of the class ``internlm.util.registry.Registry``. An example is shown as follows.
.. code-block:: python
MODEL_TYPE = "NEW_MODEL"
@MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
def build_new_model_with_cfg(*args, **kwargs):
    # build and return the new model instance from the given arguments
    ...
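After registration, setting ``model_type = "NEW_MODEL"`` in the config file makes model initialization dispatch to the newly registered build function.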
.. _InternLM-optim-init:
Optimizer Initialization
-------------------------
.. autofunction:: internlm.train.initialize_optimizer
.. _InternLM-dl-init:
Dataloader Initialization
-------------------------
.. autofunction:: internlm.train.get_train_data_loader
.. _InternLM-trainer-init:
Trainer Initialization
-------------------------
.. autofunction:: internlm.initialize.initialize_trainer
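Putting these pieces together, the following is a minimal sketch of the overall initialization sequence. The four-element return value of ``initialize_trainer`` and the ``criterion`` variable are assumptions for illustration, not the exact API:

.. code-block:: python

    import internlm

    model = internlm.train.initialize_model()
    optimizer, beta2_scheduler, lr_scheduler = internlm.train.initialize_optimizer(model=model)
    train_dl, dataset_types = internlm.train.get_train_data_loader(num_worker=4)

    # criterion is assumed to be a loss instance such as a cross-entropy loss.
    trainer, train_dl, _, _ = internlm.initialize.initialize_trainer(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        train_dataloader=train_dl,
    )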

View File

@@ -1,70 +1,2 @@
## Installation
### Environment Preparation
The required packages and corresponding version are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS
After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```
### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```
It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```
Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```
Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```
```{include} ../../en/install.md
```

View File

@@ -1,10 +1,26 @@
Monitor and Alert
=================
Monitoring
-----------------
InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize the monitoring context. Within this context,
a singleton ``internlm.monitor.monitor.MonitorManager`` manages the monitoring thread and tracks training status
with ``internlm.monitor.monitor.MonitorTracker``.
.. autofunction:: internlm.monitor.monitor.initialize_monitor_manager
.. autoclass:: internlm.monitor.monitor.MonitorManager
:members:
.. autoclass:: internlm.monitor.monitor.MonitorTracker
:members:
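Since ``initialize_monitor_manager()`` is a context manager, a minimal usage sketch is shown below; the webhook URL and the ``train()`` entry point are placeholders:

.. code-block:: python

    from internlm.monitor.monitor import initialize_monitor_manager

    with initialize_monitor_manager(job_name="7b_train", alert_address="https://open.feishu.cn/xxx"):
        train()  # placeholder for the training entry point; monitoring covers its lifetime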
Alerting
-----------------
The InternLM monitor thread periodically checks for loss spikes, potential stuck conditions, runtime exceptions, and SIGTERM signals.
When any of these occurs, an alert is triggered and a message is sent to the Feishu webhook address by calling
``internlm.monitor.alert.send_feishu_msg_with_webhook()``.
.. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook

View File

@@ -1,23 +1,158 @@
Parallel Training
=================
==================
.. Brief introduction to training parallelism, and how-to guide about config setting
InternLM supports tensor parallel, pipeline parallel, sequence parallel, data parallel, and ZeRO1.5 to parallelize the training pipeline.
When initializing the distributed environment, we need to specify the tensor parallel size, pipeline parallel size, data parallel size,
and the ZeRO1.5 strategy.
The parallel setting of InternLM is fully config-driven, and you can change the parallelism by modifying the
`config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_. An example parallel training configuration can be defined as follows:
.. code-block:: python
parallel = dict(
zero1=8,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
- zero1: the ZeRO parallel strategy, which falls into the following three cases; the default value is -1
- When ``size <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When ``size == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When ``size > 1`` and ``size <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, the default value is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool, enables or disables communication overlap when using the interleaved scheduler; the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
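For example, the 30B demo in this documentation runs on 16 GPUs with ``tensor=4`` and ``pipeline=dict(size=1)``, giving a data parallel size of 16 / 1 / 4 = 4.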
Tensor Parallel
-----------------
The implementation of tensor parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_, which has tensor
parallel extensions to parallelize `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ and
`linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ blocks in InternLM model.
To use tensor parallel, you need to set the value of tensor parallel size ``parallel.tensor`` in the config file, which is usually the number of GPUs per node.
.. figure:: ../../imgs/tensor_parallel.png
:scale: 50%
:class: with-border
Tensor parallel, adopted from `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_
Pipeline Parallel
-----------------
InternLM uses `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ (one forward pass followed by one backward pass) for pipeline parallel. For 1F1B strategy, there are two implementations:
(1) non-interleaved scheduler, which is memory-efficient (2) interleaved scheduler, which is both memory-efficient and time-efficient.
.. figure:: ../../imgs/pipeline_schedule.png
:scale: 45%
:class: with-border
Non-interleaved and interleaved scheduler for 1F1B pipeline parallelism, adopted from `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_
Scheduler for non-interleaved 1F1B strategy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To use non-interleaved pipeline scheduler, you need to set ``model.num_chunks = 1`` in the config file.
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
:members:
Scheduler for interleaved 1F1B strategy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To use interleaved pipeline scheduler, you need to set ``model.num_chunks > 1`` in the config file.
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler
:members:
Also, to enable communication overlap when using interleaved pipeline scheduler, you need to set ``parallel.pipeline.interleaved_overlap = True``
in the config file.
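An illustrative config fragment combining the two settings (``num_chunks=2`` and the pipeline size of 4 are example values, not recommendations):

.. code-block:: python

    model = dict(
        num_chunks=2,  # num_chunks > 1 selects the interleaved pipeline scheduler
    )
    parallel = dict(
        pipeline=dict(size=4, interleaved_overlap=True),  # enable communication overlap
    )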
When ``parallel.pipeline.interleaved_overlap = True``, the function ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap`` will be called, and
``internlm.core.communication.AsynCommunicator`` will be created to manage async communication. Asynchronous communication is enabled in the 1F1B stage to make full
use of uplink/downlink bandwidth and achieve communication overlap.
The difference between the 1F1B stage without overlap and the 1F1B stage with overlap is as follows:
The 1F1B stage without overlap consists of the following steps:
.. code-block:: bash
1. Perform the forward pass.
2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration to the previous stage, and receive the forward and backward inputs for the next iteration.
The 1F1B stage with overlap consists of the following steps:
.. code-block:: bash
1. Perform the forward pass.
2. Check if the backward input is ready.
3. Send the forward output and receive the forward input for the next iteration.
4. Perform the backward pass.
5. Check if the forward input is ready.
6. Send the backward output and receive the backward input for the next iteration.
Sequence Parallel
-----------------
Sequence parallel is a technique to reduce activation memory in layer norm and dropout without additional computation, communication or memory overhead.
The implementation of sequence parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_.
To enable sequence parallel, you need to set ``parallel.sequence_parallel = True`` in the config file.
.. figure:: ../../imgs/sequence_parallel.png
:scale: 50%
:class: with-border
Sequence parallel, adopted from flash-attention
Data Parallel
-----------------
InternLM supports data parallel. For data parallel:
`Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
ZeRO1.5
-----------------
The implementation of ZeRO1.5 uses the concept of hierarchical sharding via config value ``parallel.zero1``, which enables sharding within local nodes.
1. If ``parallel.zero1 <= 0``, the size of the zero process group is equal to the size of the dp process group, so parameters will be divided within the range of dp.
2. If ``parallel.zero1 == 1``, zero is not used, and all dp groups retain the full amount of model parameters.
3. If ``parallel.zero1 > 1`` and ``parallel.zero1 <= dp world size``, the world size of zero is a subset of dp world size. For smaller models, it is usually a better choice to split the parameters within nodes with a setting ``parallel.zero1 <= 8``.
Furthermore, you can enable communication-computation overlap and set the reduce bucket size and gradient clipping parameters in the config file.
.. code-block:: python
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
There are two communication optimizations worth paying attention to here:
- overlap_sync_grad: If set to True, the training backward pass is overlapped with the gradients' all-reduce communication.
- overlap_sync_param: If set to True, the parameters' broadcast communication is overlapped with the next step's forward pass.
.. autoclass:: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer
:members:

View File

@@ -1,11 +1,29 @@
Profiler
========
.. Mainly about the usage of torch profiler and memory profiler
Torch Profiler
-----------------
InternLM uses ``internlm.train.initialize_llm_profile()`` to profile performance data, recording the execution time and a breakdown of
step time. The implementation is based on `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_, and the output tracing files can
be visualized with `tensorboard <https://www.tensorflow.org>`_.
To use this torch profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After torch
profiling is completed, you can find the profiling results in the ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder.
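For example, reusing the slurm launch command from the 7B demo, torch profiling can be enabled as follows (the partition name and GPU counts are illustrative):

.. code-block:: bash

    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py --profiling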
.. autofunction:: internlm.train.initialize_llm_profile
Memory Profiler
-----------------
InternLM provides a practical solution, ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler``, to monitor actual GPU memory usage.
In the implementation, model data (including model parameters, model gradients, and optimizer states) and non-model data
(including activations) are calculated.
To use this memory profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After memory
profiling is completed, you can find the profiling results (including logs of memory usage at different time points and sunburst charts
showing overall memory usage) for a specific rank device in the ``memory_trace/rank{}_dp{}_tp{}`` folder.
.. autoclass:: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
:members:

View File

@@ -0,0 +1,2 @@
Q&A
===

View File

@@ -1,2 +1,10 @@
Training API
============
The InternLM training API is managed by ``internlm.core.trainer.Trainer``. After defining the training engine and runtime scheduler,
we can call the training API to perform training, evaluation, gradient zeroing, and parameter update steps.
For detailed usage, please refer to the Trainer API documentation and examples.
.. autoclass:: internlm.core.trainer.Trainer
:members:
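As a rough sketch of how these methods fit together in one training step, assuming ``trainer`` comes from ``initialize_trainer`` and ``data_iter`` is a dataloader iterator (the unpacking of ``execute_schedule``'s return value is an assumption):

.. code-block:: python

    trainer.train()      # set the model to training mode
    trainer.zero_grad()  # clear gradients from the previous step
    # Run the forward and backward passes through the runtime scheduler.
    _, _, loss = trainer.execute_schedule(data_iter)
    trainer.step()       # apply the parameter update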

View File

@@ -0,0 +1,4 @@
```{include} ../../en/usage.md
:relative-docs: docs/
:relative-images:
```

View File

@@ -1,4 +1,4 @@
## InternLM Installation
## Installation
### Environment Preparation
The required packages and corresponding version are shown as follows:

View File

@@ -1,4 +1,4 @@
## Pre-training and Fine-tuning Tutorial for InternLM
## Quickstart Guide for Pre-training and Fine-tuning
To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration.
@@ -93,10 +93,7 @@ data = dict(
)
```
<div align="left">
<img src="../imgs/pack_into_one.png" width="550"/>
</div>
![pack_into_one](../imgs/pack_into_one.png)
Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows:

Binary file not shown (added, 252 KiB).

Binary file not shown (added, 170 KiB).

Binary file not shown (added, 129 KiB).

View File

@@ -25,13 +25,13 @@ class NonPipelineScheduler(BaseScheduler):
gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable
gradient accumulation.
Example:
# this shows an example of customized data_process_func
def data_process_func(dataloader_output):
item1, item2, item3 = dataloader_output
data = (item1, item2)
label = item3
return data, label
Examples:
>>> # this shows an example of customized data_process_func
>>> def data_process_func(dataloader_output):
>>> item1, item2, item3 = dataloader_output
>>> data = (item1, item2)
>>> label = item3
>>> return data, label
"""
def __init__(

View File

@@ -1071,8 +1071,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
1. Perform the forward pass.
2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
to the previous stage,
and receive the forward and backward inputs for the next iteration.
to the previous stage, and receive the forward and backward inputs for the next iteration.
Args:
engine (Engine): The engine to use for computation.

View File

@@ -131,10 +131,12 @@ class Trainer:
@property
def engine(self):
"""Returns the engine that responsible for managing the training and evaluation process."""
return self._engine
@property
def schedule(self):
"""Returns the runtime scheduler."""
return self._schedule
@property
@@ -143,15 +145,19 @@
return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))
def train(self):
"""Sets the model to training mode."""
self._engine.train()
def eval(self):
"""Sets the model to evaluation mode."""
self._engine.eval()
def zero_grad(self):
"""Sets the gradient of all parameters in the model to zero."""
self._engine.zero_grad()
def step(self):
"""Executes the parameter update step."""
return self._engine.step()
def execute_schedule(self, data_iter: Iterable, **kwargs):

View File

@@ -43,8 +43,8 @@ def initialize_trainer(
loaded into gpc.config.
Args:
model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
optimizer (:class:`BaseOptimizer`.
model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
optimizer (:class:`BaseOptimizer`): Your optimizer for training.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.

View File

@@ -22,7 +22,7 @@ def get_default_parser():
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, help="path to the config file")

View File

@@ -451,10 +451,9 @@ def build_model_with_cfg(
use_scaled_init: bool = True,
use_swiglu: bool = True,
use_flash_attn: bool = True,
sequence_parallel: bool = False, # pylint: disable=W0613
):
"""
Builde model with config
Build model with config.
Args:
num_chunks (int): The number of partitions in pipeline parallel. 1 by default.

View File

@@ -211,6 +211,14 @@ monitor_manager = MonitorManager()
@contextmanager
def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
"""
Initialize monitor manager for monitoring training lifetime and alerting exception info to Feishu.
Args:
job_name (str): The training job name.
alert_address (str): The Feishu webhook address for sending alert messages.
"""
if alert_address is not None:
try:
monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)

View File

@@ -45,9 +45,11 @@ logger = get_logger(__file__)
def initialize_model():
"""
Initialize model.
Initialize model with Automatic Mixed Precision.
Returns: The neural network model to be trained or evaluated.
Returns:
torch.nn.Module:
The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
@@ -88,9 +90,10 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
Returns:
A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model)
@@ -125,7 +128,14 @@ def get_train_data_loader(
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
Args:
num_worker (:class:`int`): number of subprocesses used for dataloader.
dataset_generate_func (:class:`Callable`, optional): generate function for dataset.
train_sampler (:class:`torch.utils.data.sampler`, optional): dataset sampler for training dataloader.
train_collate_fn (:class:`Callable`, optional): collate function for training dataloader.
Returns:
A tuple of (train_dl, dataset_types).
"""
# Get the dataset types