docs(*): add documentation and reST files for readthedocs (#272)

* add initial reST files for readthedocs

* fix typos

* docs refine and minor fix

* add references for parallel training section

* fix reST format

* fix reST format

* fix reST format

* add comments for trainer API

* add link to step-by-step quickstart guide

* docs(code-docs/source/parallel.rst): add paper link url

* docs(code-docs/source/parallel.rst): add paper link url

* use MyST to render markdown

* docs(code-docs/source/initialize.rst): update model init

* add requirements for myst-parser

* reuse install and usage markdown

* docs(code-docs/source/index.rst): add example and q&a

* docs(doc/code-docs/*): docs refine

* docs(code-docs/source/parallel.rst): update docs for zero config

* docs(code-docs/source/example.rst): fix typos for example.rst

* docs(code-docs/source/example.rst): refine docs

* docs(code-docs/source/example): update example

* docs(code-docs/source/example): delete useless example

* docs(code-docs/source/*): fix image display issue

* docs(code-docs/source/parallel.rst): add docs for communication overlap

* docs(code-docs/source/conf.py): update conf.py

* docs(code-docs/source/example): update example 30B demo

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update pipeline parallel

* docs(code-docs/source/parallel.rst): update ZeRO1.5

* docs(code-docs/source/parallel.rst): update ZeRO1.5

* docs(code-docs/source): fix word spelling error

---------

Co-authored-by: huangting4201 <huangting3@sensetime.com>
Season 2023-09-06 15:36:03 +08:00 committed by GitHub
parent 7f61505fa0
commit b6d909d43e
28 changed files with 755 additions and 110 deletions

View File

@@ -1,6 +1,5 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
@@ -8,3 +7,5 @@ enum_tools
numpy
torch
tqdm
pyecharts
myst-parser

View File

@@ -1,2 +1,12 @@
Model Checkpointing
===================
InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to manage model checkpointing. In the implementation,
we call ``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint training states at specific steps. InternLM supports
automatic loading of the latest checkpoint at startup and automatic checkpointing when a quit signal is received.
Checkpointing
-------------
.. autoclass:: internlm.utils.model_checkpoint.CheckpointManager
:members:
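As a rough usage sketch, assuming a training loop in which ``train_one_step`` and the constructor arguments are purely illustrative (the real signature is documented above):

.. code-block:: python

    from internlm.utils.model_checkpoint import CheckpointManager

    # Hypothetical construction for illustration; consult CheckpointManager's
    # actual signature before use.
    ckpt_manager = CheckpointManager(ckpt_config=gpc.config.ckpt)

    for step in range(total_steps):
        train_one_step(...)  # placeholder for one training iteration
        # Saves training states when the configured step interval is reached.
        ckpt_manager.try_save_checkpoint(train_state)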

View File

@@ -12,19 +12,25 @@ import sys
project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"
with open("../../../version.txt", "r") as f:
release = f.readline().rstrip()
master_doc = 'index'
autodoc_member_order = 'bysource'
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.viewcode",
"sphinx.ext.autodoc",
"sphinxcontrib.autodoc_pydantic",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
"myst_parser",
]
pygments_style = "sphinx"
@@ -71,7 +77,7 @@ html_static_path = ["_static"]
# GitHub integration
html_context = {
"display_github": True,
"github_user": "pjlab",
"github_user": "InternLM",
"github_repo": "InternLM",
"github_version": "master",
"conf_py_path": "/doc/code-docs/source/",

View File

@@ -0,0 +1,203 @@
30B Demo
================
Training Config
----------------
30B demo config file example:
.. code-block:: python
JOB_NAME = "30b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
NUM_LAYER = 60
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro-batches contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, which disables evaluation
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with fewer than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=-1,
tensor=4,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
Start Training
----------------
After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs.
.. code-block:: bash
srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/30B_sft.py
Training Results
----------------
Taking the demo training on two nodes with 16 GPUs on slurm as an example, the training log is shown below:
.. code-block:: bash
2023-09-06 10:29:26,629 INFO parallel_context.py:508 in set_device -- process rank 10 is bound to host:HOST-10-140-66-20 device: 2
2023-09-06 10:29:26,632 INFO parallel_context.py:508 in set_device -- process rank 11 is bound to host:HOST-10-140-66-20 device: 3
2023-09-06 10:29:26,634 INFO parallel_context.py:508 in set_device -- process rank 12 is bound to host:HOST-10-140-66-20 device: 4
2023-09-06 10:29:26,636 INFO parallel_context.py:508 in set_device -- process rank 9 is bound to host:HOST-10-140-66-20 device: 1
2023-09-06 10:29:26,640 INFO parallel_context.py:508 in set_device -- process rank 15 is bound to host:HOST-10-140-66-20 device: 7
2023-09-06 10:29:26,639 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:HOST-10-140-66-9 device: 0
2023-09-06 10:29:26,641 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:HOST-10-140-66-9 device: 2
2023-09-06 10:29:26,643 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:HOST-10-140-66-9 device: 5
2023-09-06 10:29:26,645 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:HOST-10-140-66-9 device: 6
2023-09-06 10:29:26,661 INFO parallel_context.py:508 in set_device -- process rank 13 is bound to host:HOST-10-140-66-20 device: 5
2023-09-06 10:29:26,707 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:HOST-10-140-66-9 device: 1
2023-09-06 10:29:26,826 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:HOST-10-140-66-9 device: 4
2023-09-06 10:29:26,871 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:HOST-10-140-66-9 device: 7
2023-09-06 10:29:26,932 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:HOST-10-140-66-9 device: 3
2023-09-06 10:29:27,156 INFO parallel_context.py:508 in set_device -- process rank 14 is bound to host:HOST-10-140-66-20 device: 6
2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0
2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4
2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0
2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan
2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan
2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan
2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan
2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan
2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan

View File

@@ -0,0 +1,193 @@
7B Demo
================
Training Config
----------------
7B demo config file example:
.. code-block:: python
JOB_NAME = "7b_train"
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training (load weights and scheduler/context states).
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]), # directory for snapshot ckpt storage.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
)
TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro-batches contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, which disables evaluation
valid_every=50,
pack_sample_into_one=False,
total_steps=50000,
skip_batches="",
rampup_batch_size="",
# Datasets with fewer than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
)
grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
loss = dict(
label_smoothing=0,
)
adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)
lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
"""
zero1 parallel:
1. if zero1 <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. if zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=8,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False
cudnn_benchmark = False
Start Training
----------------
After completing the data preparation and relevant training configurations, you can start the demo training.
The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs.
.. code-block:: bash
srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
Training Results
----------------
Taking the demo training on a single machine with 8 GPUs on slurm as an example, the training log is shown below:
.. code-block:: bash
2023-09-05 11:47:44,649 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:SH-IDC1-10-140-1-110 device: 4
2023-09-05 11:47:44,650 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:SH-IDC1-10-140-1-110 device: 3
2023-09-05 11:47:44,651 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:SH-IDC1-10-140-1-110 device: 6
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:SH-IDC1-10-140-1-110 device: 7
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:SH-IDC1-10-140-1-110 device: 5
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:SH-IDC1-10-140-1-110 device: 1
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:SH-IDC1-10-140-1-110 device: 2
2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0
2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1
2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0
2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan
2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan
2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan
2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan
2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan
2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan

View File

@@ -0,0 +1,18 @@
Training Example
================
7B Demo
------------
.. toctree::
:maxdepth: 2
7B_demo
30B Demo
------------
.. toctree::
:maxdepth: 2
30B_demo

View File

@@ -3,6 +3,7 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
InternLM
========
@@ -14,6 +15,14 @@ Environment Setup
install
Quickstart Guide
-------------------
.. toctree::
:maxdepth: 2
usage
Model Setup
-------------------
@@ -39,7 +48,7 @@ Parallel Training
parallel
Model Checkpointing
-------------------
--------------------
.. toctree::
:maxdepth: 2
@@ -62,6 +71,22 @@ Monitor
monitor
Example
-------------------
.. toctree::
:maxdepth: 2
example/index
Q&A
-------------------
.. toctree::
:maxdepth: 2
qa
Indices and tables
==================

View File

@@ -20,16 +20,71 @@ parser with some builtin arguments, users can add custom parameters to this pars
.. autofunction:: internlm.initialize.get_default_parser
.. _InternLM-init:
.. _InternLM-model-init:
Model Initialization
-------------------------
.. autofunction:: internlm.train.initialize_model
InternLM uses the fields ``model_type`` and ``model`` in the config file to control the model initialization process. An example model initialization configuration
can be defined as follows:
.. code-block:: python
model_type = "INTERNLM" # default is "INTERNLM", used to register classes and modules for model initialization
NUM_ATTENTION_HEAD = 32
VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_LAYER = 32
MLP_RATIO = 8 / 3
model = dict(
checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
)
- The field ``model_type`` specifies the type of registered model to be initialized.
- The parameters in the field ``model`` specify the configuration settings used during model initialization.
It is worth noting that users can define a new model type and register the model's initialization function with the decorator ``@MODEL_INITIALIZER.register_module``, where ``MODEL_INITIALIZER`` is an instance of the class ``internlm.util.registry.Registry``. An example is shown as follows.
.. code-block:: python
MODEL_TYPE = "NEW_MODEL"
@MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
def build_new_model_with_cfg(*args, **kwargs):
    # build and return the new model instance from the given arguments
    ...
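After registration, setting ``model_type = "NEW_MODEL"`` in the config file makes model initialization dispatch to the newly registered build function.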
.. _InternLM-optim-init:
Optimizer Initialization
-------------------------
.. autofunction:: internlm.train.initialize_optimizer
.. _InternLM-dl-init:
Dataloader Initialization
-------------------------
.. autofunction:: internlm.train.get_train_data_loader
.. _InternLM-trainer-init:
Trainer Initialization
-------------------------
.. autofunction:: internlm.initialize.initialize_trainer
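Putting these pieces together, the following is a minimal sketch of the overall initialization sequence. The four-element return value of ``initialize_trainer`` and the ``criterion`` variable are assumptions for illustration, not the exact API:

.. code-block:: python

    import internlm

    model = internlm.train.initialize_model()
    optimizer, beta2_scheduler, lr_scheduler = internlm.train.initialize_optimizer(model=model)
    train_dl, dataset_types = internlm.train.get_train_data_loader(num_worker=4)

    # criterion is assumed to be a loss instance such as a cross-entropy loss.
    trainer, train_dl, _, _ = internlm.initialize.initialize_trainer(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        train_dataloader=train_dl,
    )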

View File

@@ -1,70 +1,2 @@
## Installation
### Environment Preparation
The required packages and corresponding version are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS
After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```
### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```
It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```
Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```
Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```
```{include} ../../en/install.md
```

View File

@@ -1,10 +1,26 @@
Monitor and Alert
=================
Monitoring
-----------------
InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize the monitoring context. Within this context,
a singleton ``internlm.monitor.monitor.MonitorManager`` manages the monitoring thread and tracks training status
with ``internlm.monitor.monitor.MonitorTracker``.
.. autofunction:: internlm.monitor.monitor.initialize_monitor_manager
.. autoclass:: internlm.monitor.monitor.MonitorManager
:members:
.. autoclass:: internlm.monitor.monitor.MonitorTracker
:members:
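Since ``initialize_monitor_manager()`` is a context manager, a minimal usage sketch is shown below; the webhook URL and the ``train()`` entry point are placeholders:

.. code-block:: python

    from internlm.monitor.monitor import initialize_monitor_manager

    with initialize_monitor_manager(job_name="7b_train", alert_address="https://open.feishu.cn/xxx"):
        train()  # placeholder for the training entry point; monitoring covers its lifetime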
Alerting
-----------------
The InternLM monitor thread periodically checks for loss spikes, potential stuck conditions, runtime exceptions, and SIGTERM signals.
When any of these occurs, an alert is triggered and a message is sent to the Feishu webhook address by calling
``internlm.monitor.alert.send_feishu_msg_with_webhook()``.
.. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook

View File

@@ -1,23 +1,158 @@
Parallel Training
=================
==================
.. Brief introduction to training parallelism, and how-to guide about config setting
InternLM supports tensor parallel, pipeline parallel, sequence parallel, data parallel, and ZeRO1.5 to parallelize the training pipeline.
When initializing the distributed environment, we need to specify the tensor parallel size, pipeline parallel size, data parallel size,
and the ZeRO1.5 strategy.
The parallel setting of InternLM is fully config-driven, and you can change the parallelism by modifying the
`config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_. An example parallel training configuration can be defined as follows:
.. code-block:: python
parallel = dict(
zero1=8,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
- zero1: the ZeRO parallel strategy, which falls into the following three cases; the default value is -1
- When ``size <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When ``size == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When ``size > 1`` and ``size <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, the default value is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool, enables or disables communication overlap when using the interleaved scheduler; the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
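For example, the 30B demo in this documentation runs on 16 GPUs with ``tensor=4`` and ``pipeline=dict(size=1)``, giving a data parallel size of 16 / 1 / 4 = 4.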
Tensor Parallel
-----------------
The implementation of tensor parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_, which has tensor
parallel extensions to parallelize `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ and
`linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ blocks in InternLM model.
To use tensor parallel, you need to set the value of tensor parallel size ``parallel.tensor`` in the config file, which is usually the number of GPUs per node.
.. figure:: ../../imgs/tensor_parallel.png
:scale: 50%
:class: with-border
Tensor parallel, adopted from `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_
Pipeline Parallel
-----------------
InternLM uses `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ (one forward pass followed by one backward pass) for pipeline parallel. For 1F1B strategy, there are two implementations:
(1) non-interleaved scheduler, which is memory-efficient (2) interleaved scheduler, which is both memory-efficient and time-efficient.
.. figure:: ../../imgs/pipeline_schedule.png
:scale: 45%
:class: with-border
Non-interleaved and interleaved scheduler for 1F1B pipeline parallelism, adopted from `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_
Scheduler for non-interleaved 1F1B strategy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To use non-interleaved pipeline scheduler, you need to set ``model.num_chunks = 1`` in the config file.
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
:members:
Scheduler for interleaved 1F1B strategy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To use interleaved pipeline scheduler, you need to set ``model.num_chunks > 1`` in the config file.
.. autoclass:: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler
:members:
Also, to enable communication overlap when using interleaved pipeline scheduler, you need to set ``parallel.pipeline.interleaved_overlap = True``
in the config file.
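An illustrative config fragment combining the two settings (``num_chunks=2`` and the pipeline size of 4 are example values, not recommendations):

.. code-block:: python

    model = dict(
        num_chunks=2,  # num_chunks > 1 selects the interleaved pipeline scheduler
    )
    parallel = dict(
        pipeline=dict(size=4, interleaved_overlap=True),  # enable communication overlap
    )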
When ``parallel.pipeline.interleaved_overlap = True``, the function ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap`` will be called, and
``internlm.core.communication.AsynCommunicator`` will be created to manage async communication. Asynchronous communication is enabled in the 1F1B stage to make full
use of uplink/downlink bandwidth and achieve communication overlap.
The difference between the 1F1B stage without overlap and the 1F1B stage with overlap is as follows:
The 1F1B stage without overlap consists of the following steps:
.. code-block:: bash
1. Perform the forward pass.
2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration to the previous stage, and receive the forward and backward inputs for the next iteration.
The 1F1B stage with overlap consists of the following steps:
.. code-block:: bash
1. Perform the forward pass.
2. Check if the backward input is ready.
3. Send the forward output and receive the forward input for the next iteration.
4. Perform the backward pass.
5. Check if the forward input is ready.
6. Send the backward output and receive the backward input for the next iteration.
Sequence Parallel
-----------------
Sequence parallel is a technique to reduce activation memory in layer norm and dropout without additional computation, communication or memory overhead.
The implementation of sequence parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_.
To enable sequence parallel, you need to set ``parallel.sequence_parallel = True`` in the config file.
.. figure:: ../../imgs/sequence_parallel.png
:scale: 50%
:class: with-border
Sequence parallel, adopted from flash-attention
Data Parallel
-----------------
InternLM supports data parallel. For data parallel:
`Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
ZeRO1.5
-----------------
The implementation of ZeRO1.5 uses the concept of hierarchical sharding via config value ``parallel.zero1``, which enables sharding within local nodes.
1. If ``parallel.zero1 <= 0``, the size of the zero process group is equal to the size of the dp process group, so parameters will be divided within the range of dp.
2. If ``parallel.zero1 == 1``, zero is not used, and all dp groups retain the full amount of model parameters.
3. If ``parallel.zero1 > 1`` and ``parallel.zero1 <= dp world size``, the world size of zero is a subset of dp world size. For smaller models, it is usually a better choice to split the parameters within nodes with a setting ``parallel.zero1 <= 8``.
Furthermore, you can enable communication-computation overlap and set the reduce bucket size and gradient clipping parameters in the config file.
.. code-block:: python
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)
There are two communication optimizations worth paying attention to here:
- overlap_sync_grad: If set to True, the training backward pass is overlapped with the gradients' all-reduce communication.
- overlap_sync_param: If set to True, the parameters' broadcast communication is overlapped with the next step's forward pass.
.. autoclass:: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer
:members:

View File

@@ -1,11 +1,29 @@
Profiler
========
.. Mainly about the usage of torch profiler and memory profiler
Torch Profiler
-----------------
InternLM uses ``internlm.train.initialize_llm_profile()`` to profile performance data, recording the execution time and a breakdown of
step time. The implementation is based on `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_, and the output tracing files can
be visualized with `tensorboard <https://www.tensorflow.org>`_.
To use this torch profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After torch
profiling is completed, you can find the profiling results in the ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder.
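For example, reusing the slurm launch command from the 7B demo, torch profiling can be enabled as follows (the partition name and GPU counts are illustrative):

.. code-block:: bash

    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py --profiling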
.. autofunction:: internlm.train.initialize_llm_profile
Memory Profiler
-----------------
InternLM provides a practical solution, ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler``, to monitor actual GPU memory usage.
In the implementation, model data (including model parameters, model gradients, and optimizer states) and non-model data
(including activations) are calculated.
To use this memory profiler tool, you need to enable profiling by passing the ``--profiling`` flag when starting training. After memory
profiling is completed, you can find the profiling results (including logs of memory usage at different time points and sunburst charts
showing overall memory usage) for a specific rank device in the ``memory_trace/rank{}_dp{}_tp{}`` folder.
.. autoclass:: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
:members:

View File

@@ -0,0 +1,2 @@
Q&A
===

View File

@@ -1,2 +1,10 @@
Training API
============
The InternLM training API is managed by ``internlm.core.trainer.Trainer``. After defining the training engine and runtime scheduler,
we can call the training API to perform training, evaluation, gradient zeroing, and parameter update steps.
For detailed usage, please refer to the Trainer API documentation and examples.
.. autoclass:: internlm.core.trainer.Trainer
:members:
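As a rough sketch of how these methods fit together in one training step, assuming ``trainer`` comes from ``initialize_trainer`` and ``data_iter`` is a dataloader iterator (the unpacking of ``execute_schedule``'s return value is an assumption):

.. code-block:: python

    trainer.train()      # set the model to training mode
    trainer.zero_grad()  # clear gradients from the previous step
    # Run the forward and backward passes through the runtime scheduler.
    _, _, loss = trainer.execute_schedule(data_iter)
    trainer.step()       # apply the parameter update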

View File

@@ -0,0 +1,4 @@
```{include} ../../en/usage.md
:relative-docs: docs/
:relative-images:
```

View File

@@ -1,4 +1,4 @@
## InternLM Installation
## Installation
### Environment Preparation
The required packages and corresponding version are shown as follows:

View File

@@ -1,4 +1,4 @@
## Pre-training and Fine-tuning Tutorial for InternLM
## Quickstart Guide for Pre-training and Fine-tuning
To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration.
@@ -93,10 +93,7 @@ data = dict(
)
```
<div align="left">
<img src="../imgs/pack_into_one.png" width="550"/>
</div>
![pack_into_one](../imgs/pack_into_one.png)
Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows:

Binary file not shown (added, 252 KiB).

Binary file not shown (added, 170 KiB).

Binary file not shown (added, 129 KiB).

View File

@@ -25,13 +25,13 @@ class NonPipelineScheduler(BaseScheduler):
gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable
gradient accumulation.
Example:
# this shows an example of customized data_process_func
def data_process_func(dataloader_output):
item1, item2, item3 = dataloader_output
data = (item1, item2)
label = item3
return data, label
Examples:
>>> # this shows an example of customized data_process_func
>>> def data_process_func(dataloader_output):
>>> item1, item2, item3 = dataloader_output
>>> data = (item1, item2)
>>> label = item3
>>> return data, label
"""
def __init__(

View File

@@ -1071,8 +1071,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
1. Perform the forward pass.
2. Perform the backward pass.
3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
to the previous stage,
and receive the forward and backward inputs for the next iteration.
to the previous stage, and receive the forward and backward inputs for the next iteration.
Args:
engine (Engine): The engine to use for computation.

View File

@@ -131,10 +131,12 @@ class Trainer:
@property
def engine(self):
"""Returns the engine that responsible for managing the training and evaluation process."""
return self._engine
@property
def schedule(self):
"""Returns the runtime scheduler."""
return self._schedule
@property
@@ -143,15 +145,19 @@
return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))
def train(self):
"""Sets the model to training mode."""
self._engine.train()
def eval(self):
"""Sets the model to evaluation mode."""
self._engine.eval()
def zero_grad(self):
"""Sets the gradient of all parameters in the model to zero."""
self._engine.zero_grad()
def step(self):
"""Executes the parameter update step."""
return self._engine.step()
def execute_schedule(self, data_iter: Iterable, **kwargs):

View File

@@ -43,8 +43,8 @@ def initialize_trainer(
loaded into gpc.config.
Args:
model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
optimizer (:class:`BaseOptimizer`.
model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
optimizer (:class:`BaseOptimizer`): Your optimizer for training.
criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.

View File

@@ -22,7 +22,7 @@ def get_default_parser():
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
Returns:
Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
"""
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, help="path to the config file")

View File

@@ -451,10 +451,9 @@ def build_model_with_cfg(
use_scaled_init: bool = True,
use_swiglu: bool = True,
use_flash_attn: bool = True,
sequence_parallel: bool = False, # pylint: disable=W0613
):
"""
Builde model with config
Build model with config.
Args:
num_chunks (int): The number of partitions in pipeline parallel. 1 by default.

View File

@@ -211,6 +211,14 @@ monitor_manager = MonitorManager()
@contextmanager
def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
"""
Initialize monitor manager for monitoring training lifetime and alerting exception info to Feishu.
Args:
job_name (str): The training job name.
alert_address (str): The Feishu webhook address for sending alert messages.
"""
if alert_address is not None:
try:
monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)

View File

@@ -45,9 +45,11 @@ logger = get_logger(__file__)
def initialize_model():
"""
Initialize model.
Initialize model with Automatic Mixed Precision.
Returns: The neural network model to be trained or evaluated.
Returns:
torch.nn.Module:
The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
@@ -88,9 +90,10 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
Returns:
A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model)
@@ -125,7 +128,14 @@ def get_train_data_loader(
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
Args:
num_worker (:class:`int`): number of subprocesses used for dataloader.
dataset_generate_func (:class:`Callable`, optional): generate function for dataset.
train_sampler (:class:`torch.utils.data.sampler`, optional): dataset sampler for training dataloader.
train_collate_fn (:class:`Callable`, optional): collate function for training dataloader.
Returns:
A tuple of (train_dl, dataset_types).
"""
# Get the dataset types