mirror of https://github.com/InternLM/InternLM
Merge branch 'develop' of https://github.com/InternLM/InternLM into storage_multipart_upload
commit 5b101f2377
@@ -50,3 +50,20 @@ jobs:
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
+
+  check_loss_when_swapping_micro_num_and_micro_bsz:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 40
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: loss_tests
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py
@@ -74,3 +74,71 @@ jobs:
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
+
+  unit_tests_model_fused_precision:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 5
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: model_fused_precision
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py
+
+  unit_tests_data_batch_sampler:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 10
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: data_batch_sample
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py
+
+  unit_tests_utils_timeout:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 5
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: utils_timeout
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py
+
+  unit_tests_utils_model_checkpoint:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 5
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: utils_model_checkpoint
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
@@ -1,102 +1,115 @@
 name: weekly-tests
 on:
   push:
     branches:
       - "main"
       - "develop"
   workflow_dispatch:
   schedule:
     - cron: '56 18 * * 5'
 env:
   SLURM_PARTITION: llm_s

 jobs:
   training_8GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: training_8GPU
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training

   training_16GPU_8DP2TP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: training_16GPU_8DP2TP
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training

   training_16GPU_8DP2TPSP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: training_16GPU_8DP2TPSP
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
         sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training

   training_16GPU_8DP2PP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: training_16GPU_8DP2PP
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training

   training_16GPU_8DP2PP_InterleavedOverlap:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: training_16GPU_8DP2PP_InterleavedOverlap
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
         sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training

   unit_test_optimizer:
     runs-on: [t_cluster]
-    timeout-minutes: 30
+    timeout-minutes: 35
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: test_optimizer
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py

   unit_test_model:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
     - uses: actions/checkout@v3
       with:
         ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}

     - name: test_embedding_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py

     - name: test_model_internlm_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py

     - name: test_norm_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
@@ -44,8 +44,8 @@ ckpt = dict(
     oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
 )

-TRAIN_FOLDER = "/path/to/dataset"
-VALID_FOLDER = "/path/to/dataset"
+TRAIN_FOLDER = None # "/path/to/dataset"
+VALID_FOLDER = None # "/path/to/dataset"
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
@@ -64,12 +64,12 @@ data = dict(
     # each increment. For example, "192 24 8" means that the batch size (micro_num)
     # starts at 192 and increases by 24 every 8 steps. Defaults to None.
     # (IMPORTANT): The interval step size is 'micro_bsz'.
-    rampup_batch_size=None,
+    rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
     min_length=50,
-    # train_folder=TRAIN_FOLDER,
-    # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    train_folder=TRAIN_FOLDER,
+    valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=200,
     diag_outlier_ratio=1.1,
 )
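The config comment above documents rampup_batch_size as a "start increment interval" string: "192 24 8" starts the micro_num at 192 and adds 24 every 8 steps. As a point of reference, here is a minimal sketch of how such a string can be expanded into a per-step value; the helper name and the exact step semantics are illustrative assumptions, not InternLM's actual parser.

# Illustrative sketch only: expands a "start increment interval" ramp-up string
# into the micro_num used at a given step. The function name and step semantics
# are assumptions for illustration, not the InternLM implementation.
def rampup_micro_num(rampup_batch_size: str, target_micro_num: int, step: int) -> int:
    if not rampup_batch_size:  # "" or None: no ramp-up, use the target value directly
        return target_micro_num
    start, increment, interval = (int(x) for x in rampup_batch_size.split())
    ramped = start + increment * (step // interval)
    return min(ramped, target_micro_num)

# "192 24 8": starts at 192 and increases by 24 every 8 steps, capped at the target.
assert rampup_micro_num("192 24 8", 256, 0) == 192
assert rampup_micro_num("192 24 8", 256, 8) == 216
assert rampup_micro_num("192 24 8", 256, 1000) == 256

In this sketch an empty string, like the one the new config sets above, simply means no ramp-up, mirroring the comment's "Defaults to None".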
@@ -35,19 +35,19 @@ def get_tensor_shape():
             if gpc.config.parallel.sequence_parallel:
                 sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
                 tensor_shape = (
-                    gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
-                    gpc.config.HIDDEN_SIZE,
+                    gpc.config.data["seq_len"] * gpc.config.data["micro_bsz"] // sequence_world_size,
+                    gpc.config.model["hidden_size"],
                 )
             else:
                 tensor_shape = (
-                    gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"],
-                    gpc.config.HIDDEN_SIZE,
+                    gpc.config.data["seq_len"] * gpc.config.data["micro_bsz"],
+                    gpc.config.model["hidden_size"],
                 )
         else:
             tensor_shape = (
                 gpc.config.data["micro_bsz"],
-                gpc.config.SEQ_LEN,
-                gpc.config.HIDDEN_SIZE,
+                gpc.config.data["seq_len"],
+                gpc.config.model["hidden_size"],
             )
         return tensor_shape
     else:
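To make the rewritten lookups concrete, the sketch below evaluates the three branches above with plain dictionaries standing in for gpc.config; the sample values (seq_len 2048, micro_bsz 2, tensor-parallel world size 2, hidden_size 4096) are illustrative assumptions, not InternLM defaults.

# Illustrative only: plain dicts stand in for gpc.config.data / gpc.config.model.
data = {"seq_len": 2048, "micro_bsz": 2}
model = {"hidden_size": 4096}
sequence_world_size = 2  # tensor-parallel world size used when sequence_parallel is enabled

# first branch: 2-D (tokens, hidden) shape with the token dimension split across the tensor group
shape_seq_parallel = (data["seq_len"] * data["micro_bsz"] // sequence_world_size, model["hidden_size"])

# second branch: 2-D (tokens, hidden) shape without sequence parallelism
shape_no_seq_parallel = (data["seq_len"] * data["micro_bsz"], model["hidden_size"])

# final branch: 3-D (micro_bsz, seq_len, hidden) shape
shape_batch_first = (data["micro_bsz"], data["seq_len"], model["hidden_size"])

print(shape_seq_parallel)     # (2048, 4096)
print(shape_no_seq_parallel)  # (4096, 4096)
print(shape_batch_first)      # (2, 2048, 4096)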
@@ -13,6 +13,7 @@ from internlm.core.context import Config
 from internlm.core.context import global_context as gpc
 from internlm.monitor import initialize_light_monitor
 from internlm.utils.common import get_master_node
+from internlm.utils.gputest import warmup_process_group
 from internlm.utils.logger import get_logger
 from internlm.utils.timeout import llm_timeout
@@ -60,6 +61,9 @@ def get_default_parser():
 def args_sanity_check():
     assert gpc.config is not None, "config is not load!"

+    if "JOB_NAME" not in gpc.config:
+        gpc.config._add_item("JOB_NAME", "AnonymousJob")
+
     # the default model type is INTERNLM
     if "model_type" not in gpc.config:
         gpc.config._add_item("model_type", "INTERNLM")
@@ -144,10 +148,6 @@ def args_sanity_check():
     if "diag_outlier_ratio" not in data:
         data._add_item("diag_outlier_ratio", 1.1)

-    if "rampup_batch_size" not in data or not data.rampup_batch_size or len(data.rampup_batch_size) == 0:
-        bsz = data.micro_num
-        data._add_item("rampup_batch_size", f"{bsz} {bsz} 1")
-
     data.diag_outlier_ratio = max(1, data.diag_outlier_ratio)

     if gpc.is_rank_for_log():
@@ -424,6 +424,8 @@ def launch(

     gpc.set_seed(seed)

+    warmup_process_group()
+
     if gpc.is_rank_for_log():
         logger.info(
             f"Distributed environment is initialized, "
@@ -101,7 +101,7 @@ def evaluate_on_val_dls(
             assert total_val_bsz % data_cfg.micro_bsz == 0
             num_microbatches = total_val_bsz // data_cfg.micro_bsz
             tensor_shape = torch.Size(
-                [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.HIDDEN_SIZE]
+                [data_cfg.micro_bsz, batch[0]["input_ids"].shape[1], gpc.config.model["hidden_size"]]
             )

             with switch_evaluation_pipeline_scheduler(
@@ -27,10 +27,17 @@ from internlm.utils.common import get_current_device
 logger = get_logger(__file__)


+# Gloabl cuda cache flush counter
+n_caching_allocator_flushes = 0
+
+
 def empty_cache_and_diag(batch_count, interval=50):
     """empty cuda cache and run diag bench or tests."""
     if interval <= 0:
         interval = 50
+
+    cuda_memory_analyze(batch_count, batch_count % int(interval) == 0 or batch_count <= 5)
+
     if batch_count % int(interval) == 0:
         # there is no need to do diag on the first batch
         if batch_count > 0:
@@ -259,3 +266,75 @@ def bench_gpu(use_flash_attn=True):
             address=gpc.config.monitor.alert.feishu_alert_address,
             message=msg,
         )
+
+
+"""
+Useful utility functions migrated from deepseped.
+"""
+
+
+def warmup_process_group():
+    # Prevent OOM from nccl communication.
+    if dist.is_initialized():
+        buffer = torch.ones([64]).cuda()
+        if gpc.is_initialized(ParallelMode.DATA):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.DATA))
+        if gpc.is_initialized(ParallelMode.TENSOR):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.TENSOR))
+        if gpc.is_initialized(ParallelMode.PIPELINE):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.PIPELINE))
+        if gpc.is_initialized(ParallelMode.ZERO1):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.ZERO1))
+        if gpc.is_initialized(ParallelMode.MODEL):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.MODEL))
+        if gpc.is_initialized(ParallelMode.ZERO3_DP):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.ZERO3_DP))
+        if gpc.is_initialized(ParallelMode.EXPERT_DATA):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.EXPERT_DATA))
+        if gpc.is_initialized(ParallelMode.EXPERT):
+            dist.all_reduce(buffer, group=gpc.get_group(ParallelMode.EXPERT))
+
+        dist.barrier()
+        del buffer
+        torch.cuda.empty_cache()
+
+
+def cuda_memory_analyze(step=0, print_mm_suage=False):
+    global n_caching_allocator_flushes
+    torch.cuda.synchronize()
+
+    g_rank = gpc.get_global_rank()
+    tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
+    pp_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+    dp_rank = gpc.get_local_rank(ParallelMode.DATA)
+    rank_id = f"Rank:{g_rank}-tp{tp_rank}-pp{pp_rank}-dp{dp_rank}"
+
+    if print_mm_suage and gpc.get_local_rank(ParallelMode.DATA) == 0:
+        logger.info(
+            f"{rank_id}: Step {step}: "
+            f"Allocated {round(torch.cuda.memory_allocated() / (1024 * 1024 * 1024),4 )} GB, "
+            f"Max_Allocated {round(torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024),4)} GB, "
+            f"Reserved {round(torch.cuda.memory_reserved()/ (1024 * 1024 * 1024),4)} GB, "
+            f"Max_Reserved {round(torch.cuda.max_memory_reserved()/ (1024 * 1024 * 1024),4)} GB "
+        )
+
+        torch.cuda.reset_peak_memory_stats()
+
+    # warn user about caching allocator flushes
+    memory_stats = torch.cuda.memory_stats()
+    alloc_retries = memory_stats.get("num_alloc_retries")
+    if alloc_retries is None:
+        alloc_retries = 0
+    if alloc_retries > n_caching_allocator_flushes:
+        retry_count = alloc_retries - n_caching_allocator_flushes
+        if gpc.get_global_rank() == 0:
+            logger.warning(
+                f"{rank_id}: pytorch allocator cache flushes {retry_count} times since last step."
+                "this happens when there is high memory pressure and is detrimental to "
+                "performance. if this is happening frequently consider adjusting "
+                "settings to reduce memory consumption. If you are unable to "
+                "make the cache flushes go away consider adding "
+                "torch.cuda.empty_cache() calls in your training loop to ensure "
+                "that all ranks flush their caches at the same time"
+            )
+        n_caching_allocator_flushes = alloc_retries
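warmup_process_group above runs one small all-reduce on each initialized parallel group so that NCCL communicators and their workspace buffers are created right after distributed initialization instead of during the first training step. A minimal standalone sketch of the same pattern with plain torch.distributed, assuming a single default group and a script launched with torchrun on NCCL-capable GPUs, is shown below; it illustrates the idea and is not the InternLM code, which applies it per parallel group via gpc.get_group(...).

# Minimal sketch of the warm-up pattern with plain torch.distributed (assumption:
# launched with torchrun, NCCL backend available).
import os

import torch
import torch.distributed as dist


def warmup_default_group() -> None:
    """All-reduce a tiny buffer once so the NCCL communicator is built up front."""
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl")  # torchrun provides rank/world size via env vars
    torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", 0)))
    buffer = torch.ones(64, device="cuda")
    dist.all_reduce(buffer)  # the first collective on a group triggers communicator creation
    dist.barrier()
    del buffer
    torch.cuda.empty_cache()  # release the warm-up allocation immediately


if __name__ == "__main__":
    warmup_default_group()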
@@ -106,13 +106,13 @@ def main(args):
     get_tflops_func = partial(
         get_megatron_flops,
         checkpoint=gpc.config.model.checkpoint,
-        seq_len=gpc.config.SEQ_LEN,
+        seq_len=gpc.config.data["seq_len"],
         hidden_size=gpc.config.model.hidden_size,
         num_layers=gpc.config.model.num_layers,
         vocab_size=gpc.config.model.vocab_size,
         global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA),
         global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
-        mlp_ratio=gpc.config.MLP_RATIO,
+        mlp_ratio=gpc.config.model["mlp_ratio"],
     )

     # get and broadcast current time
@@ -11,20 +11,30 @@ from internlm.solver.optimizer.hybrid_zero_optim import HybridZeroOptimizer
 from internlm.train.utils import create_param_groups
 from internlm.utils.storage_manager import SingletonMeta

-OSS_NAME = os.environ.get("OSS_BUCKET_NAME")
-OSS_IP = os.environ.get("OSS_IP")
-USER = os.environ.get("USER")
+OSS_NAME = os.environ.get("OSS_BUCKET_NAME", None)
+OSS_IP = os.environ.get("OSS_IP", None)
+USER = os.environ.get("USER", None)
 JOB_NAME = "CI_TEST"
 LOCAL_SAVE_PATH = "local:local_ckpt"

-BOTO_SAVE_PATH = f"boto3:s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
-BOTO_SAVE_PATH_NO_PRFIX = f"s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+if OSS_NAME is None or OSS_IP is None:
+    BOTO_SAVE_PATH = None
+    BOTO_SAVE_PATH_NO_PRFIX = None

-VOLC_SAVE_PATH = f"volc:vc://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
-VOLC_SAVE_PATH_NO_PRFIX = f"vc://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+    VOLC_SAVE_PATH = None
+    VOLC_SAVE_PATH_NO_PRFIX = None

-ALI_SAVE_PATH = f"oss2:ali://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
-ALI_SAVE_PATH_NO_PRFIX = f"ali://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+    ALI_SAVE_PATH = None
+    ALI_SAVE_PATH_NO_PRFIX = None
+else:
+    BOTO_SAVE_PATH = f"boto3:s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
+    BOTO_SAVE_PATH_NO_PRFIX = f"s3://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+
+    VOLC_SAVE_PATH = f"volc:vc://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
+    VOLC_SAVE_PATH_NO_PRFIX = f"vc://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"
+
+    ALI_SAVE_PATH = f"oss2:ali://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}"
+    ALI_SAVE_PATH_NO_PRFIX = f"ali://{OSS_NAME}.{OSS_IP}/{USER}/{JOB_NAME}/"

 ASYNC_TMP_FOLDER = "./async_tmp_folder"
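The fixture constants above encode the storage backend as a prefix on the save path ("local:", "boto3:s3://...", "volc:vc://...", "oss2:ali://..."). The small helper below splits such a prefixed path into its backend name and raw path; it is an illustrative assumption for readers, not the storage_manager's actual parsing code.

# Illustrative helper only: splits a "backend:path" string such as
# "boto3:s3://bucket.endpoint/user/CI_TEST" into backend and raw path.
def split_save_path(save_path: str) -> tuple:
    backend, sep, raw_path = save_path.partition(":")
    if not sep:
        raise ValueError(f"missing backend prefix in {save_path!r}")
    return backend, raw_path

assert split_save_path("local:local_ckpt") == ("local", "local_ckpt")
assert split_save_path("boto3:s3://bucket.endpoint/user/CI_TEST") == ("boto3", "s3://bucket.endpoint/user/CI_TEST")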
@@ -32,7 +42,12 @@ ASYNC_TMP_FOLDER = "./async_tmp_folder"
 # 1B
 init_config = Config(
     dict(
-        parallel=dict(zero1=1, pipeline=dict(size=1, interleaved_overlap=False), sequence_parallel=False, tensor=1),
+        parallel=dict(
+            zero1=dict(size=1, fsdp=False),
+            pipeline=dict(size=1, interleaved_overlap=False),
+            sequence_parallel=False,
+            tensor=1,
+        ),
         model_type="INTERNLM",
         adam=dict(
             lr=1e-4,
@@ -91,8 +106,9 @@ def init_naive_optim(model):


 def init_hybrid_optim(model):
+    params = create_param_groups(model, 0.01)
     naive_optimizer = torch.optim.AdamW(
-        params=[{"params": model.parameters(), "weight_decay": 0.01}],
+        params=params,
         lr=1e-4,
         betas=(0.9, 0.95),
         eps=1e-8,
@@ -8,9 +8,8 @@ import torch.distributed as dist
 from internlm.core.context.parallel_context import Config
 from internlm.core.trainer import TrainState
 from internlm.solver.optimizer.hybrid_zero_optim import HybridZeroOptimizer
-from internlm.utils.common import SingletonMeta
 from internlm.utils.model_checkpoint import CheckpointManager
-from internlm.utils.storage_manager import wait_async_upload_finish
+from internlm.utils.storage_manager import SingletonMeta, wait_async_upload_finish
 from tests.test_utils.common_fixture import ( # noqa # pylint: disable=unused-import
     ASYNC_TMP_FOLDER,
     BOTO_SAVE_PATH,
@@ -32,7 +31,7 @@ ckpt_config_list = [
         checkpoint_every=0,
         async_upload=True,
         async_upload_tmp_folder=ASYNC_TMP_FOLDER,
-        snapshot_ckpt_folder="/".join([BOTO_SAVE_PATH, "snapshot"]),
+        snapshot_ckpt_folder="/".join([BOTO_SAVE_PATH, "snapshot"]) if BOTO_SAVE_PATH is not None else None,
         oss_snapshot_freq=0,
         stop_file_path=None,
         load_model_only_folder=None,
@@ -207,6 +206,9 @@ def test_ckpt_mm(step_info, ckpt_config, init_dist_and_model): # noqa # pylint:
     ckpt_config.checkpoint_every = checkpoint_every
     ckpt_config.oss_snapshot_freq = oss_snapshot_freq

+    if ckpt_config.save_ckpt_folder is None:
+        return
+
     bond_return_latest_save_path = partial(
         return_latest_save_path,
         ckpt_config.save_ckpt_folder,
@@ -298,12 +300,12 @@ def query_quit_file(rank, world_size=2):
     ckpt_config = Config(
         dict(
             enable_save_ckpt=True,
-            save_ckpt_folder=BOTO_SAVE_PATH,
+            save_ckpt_folder=LOCAL_SAVE_PATH,
             load_optimizer=True,
             checkpoint_every=0,
             async_upload=True,
             async_upload_tmp_folder=ASYNC_TMP_FOLDER,
-            snapshot_ckpt_folder="/".join([BOTO_SAVE_PATH, "snapshot"]),
+            snapshot_ckpt_folder="/".join([LOCAL_SAVE_PATH, "snapshot"]),
             oss_snapshot_freq=0,
             stop_file_path=STOP_FILE_PATH,
             load_model_only_folder=None,
train.py
@@ -77,13 +77,13 @@ def main(args):
     get_tflops_func = partial(
         get_megatron_flops,
         checkpoint=gpc.config.model.checkpoint,
-        seq_len=gpc.config.SEQ_LEN,
+        seq_len=gpc.config.data["seq_len"],
         hidden_size=gpc.config.model.hidden_size,
         num_layers=gpc.config.model.num_layers,
         vocab_size=gpc.config.model.vocab_size,
         global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA),
         global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
-        mlp_ratio=gpc.config.MLP_RATIO,
+        mlp_ratio=gpc.config.model["mlp_ratio"],
     )

     # get and broadcast current time