feat(init): add skip args check flag and add zero overlap flag (#222)

* feat(init): add skip args check flag

* fix(optim): add param overlap enable flag
pull/228/head
Guoteng 2023-08-24 16:44:18 +08:00 committed by GitHub
parent 9cd1e0314e
commit 7c820cfa40
8 changed files with 88 additions and 51 deletions

View File

@@ -75,7 +75,8 @@ grad_scaler = dict(
hybrid_zero_optimizer = dict(
# Enable low_level_optimzer overlap_communication
zero_overlap_communication=True,
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
@@ -120,7 +121,7 @@ model = dict(
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.tf32", # dtype could be in "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32",
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
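
For reference, a sketch of the optimizer section after this change (the two overlap flags and the bucket size are taken from this diff; the clipping value is illustrative):

hybrid_zero_optimizer = dict(
    # overlap gradient reduction with the backward pass
    overlap_sync_grad=True,
    # overlap the post-step parameter broadcast with computation
    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping (illustrative value, not from this diff)
    clip_grad_norm=1.0,
)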

View File

@@ -1,9 +1,15 @@
from .initialize_trainer import initialize_trainer
from .launch import get_default_parser, launch_from_slurm, launch_from_torch
from .launch import (
get_default_parser,
initialize_distributed_env,
launch_from_slurm,
launch_from_torch,
)
__all__ = [
"get_default_parser",
"initialize_trainer",
"launch_from_slurm",
"launch_from_torch",
"initialize_distributed_env",
]

View File

@@ -10,6 +10,7 @@ import torch
from internlm.core.context import Config
from internlm.core.context import global_context as gpc
from internlm.utils.common import get_master_node
from internlm.utils.logger import get_logger
from internlm.utils.storage_manager import init_storage_manager
@@ -276,6 +277,19 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
if "alert_address" not in gpc.config:
gpc.config._add_item("alert_address", None)
optim_ckpt = gpc.config.hybrid_zero_optimizer
if "zero_overlap_communication" in optim_ckpt:
# Compatible with the old interfaces.
optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
if "overlap_sync_grad" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_grad", False)
if "overlap_sync_param" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_param", False)
if gpc.is_rank_for_log():
logger.info(
f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
)
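
To make the compatibility check above concrete: an old-style config that still uses the legacy key, e.g.

hybrid_zero_optimizer = dict(
    zero_overlap_communication=True,
    reduce_bucket_size=512 * 1024 * 1024,
)

is accepted unchanged; after args_sanity_check it behaves as overlap_sync_grad=True (copied from the legacy key) and overlap_sync_param=False (the new default).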
def launch(
config: Union[str, Path, Config, Dict],
@@ -322,8 +336,6 @@ def launch(
# init process groups for different parallel modes from config
gpc.init_parallel_groups()
args_sanity_check()
# set cuda device
if torch.cuda.is_available():
# if local rank is not given, calculate automatically
@@ -376,7 +388,11 @@ def launch_from_slurm(
)
def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
def launch_from_torch(
config: Union[str, Path, Config, Dict],
backend: str = "nccl",
seed: int = 1024,
):
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
@@ -404,3 +420,38 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
backend=backend,
seed=seed,
)
def initialize_distributed_env(
config: str,
launcher: str = "slurm",
master_port: int = 8888,
seed: int = 1024,
args_check: bool = True,
):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (int): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
args_check (bool, optional): Whether to run args_sanity_check after launching. True by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only supports slurm or torch"
if args_check:
args_sanity_check()
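
A usage sketch for the new entry point (the config path is hypothetical; the manual check call shows what the new args_check flag is for):

from internlm.initialize import initialize_distributed_env
from internlm.initialize.launch import args_sanity_check

# launch under torchrun, but postpone the automatic config sanity check
initialize_distributed_env(
    config="configs/my_exp.py",  # hypothetical config path
    launcher="torch",            # or "slurm"
    seed=1024,
    args_check=False,
)
# ...adjust gpc.config in-process if needed, then validate explicitly
args_sanity_check()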

View File

@@ -133,7 +133,7 @@ class MHA(nn.Module):
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.float16):
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv).to(x.dtype)
@@ -171,7 +171,7 @@ class MHA(nn.Module):
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.float16):
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv, **kwargs).to(x.dtype)
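
The two hunks above change the fp32 + flash-attention path to autocast into bfloat16 instead of float16. A standalone sketch of that guard (illustrative only; bfloat16 keeps float32's exponent range, which makes it the safer downcast here):

import torch

def cast_qkv_for_flash_attn(qkv: torch.Tensor) -> torch.Tensor:
    # Same pattern as in MHA.forward: open a bfloat16 autocast region and
    # make sure the packed qkv tensor is half precision before the kernel runs.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        if qkv.dtype not in (torch.float16, torch.bfloat16):
            qkv = qkv.to(torch.bfloat16)
        return qkv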

View File

@@ -106,9 +106,10 @@ class HybridZeroOptimizer(BaseOptimizer):
max_scale = grad_scal_cfg.max_scale
# Zero related args
overlap_communication = zero_cfg.zero_overlap_communication
reduce_bucket_size = zero_cfg.reduce_bucket_size
clip_grad_norm = zero_cfg.clip_grad_norm
self._overlap_sync_grad = zero_cfg.overlap_sync_grad
self._overlap_sync_param = zero_cfg.overlap_sync_param
super().__init__(optim=optimizer)
@@ -129,7 +130,7 @@ class HybridZeroOptimizer(BaseOptimizer):
self._fp32_flat_param_groups_of_current_rank = dict()
# communication params
self._overlap_communication = overlap_communication
# self._overlap_communication = overlap_communication
self._reduce_bucket_size = reduce_bucket_size
# gradient scaler
@@ -161,8 +162,11 @@ class HybridZeroOptimizer(BaseOptimizer):
)
self.params_per_rank_id_dict = []
self._param_bcast_sync_handler = param_bcast_sync_handler
if self._overlap_communication:
if self._overlap_sync_param:
assert self._param_bcast_sync_handler is not None
self._broadcast_comm_stream = torch.cuda.Stream()
else:
self._broadcast_comm_stream = torch.cuda.current_stream()
# iterate over the param group in the optimizer
# partition these param groups for data parallel training
@@ -232,14 +236,14 @@ class HybridZeroOptimizer(BaseOptimizer):
# initialize communication stream for
# communication-computation overlapping
if self._overlap_communication:
if self._overlap_sync_grad:
self._comm_stream = torch.cuda.Stream()
else:
self._comm_stream = torch.cuda.current_stream()
# reduction hook is only used if overlapping communication
# if it is stage 1 without overlapping, no hook will be attached
if self._overlap_communication:
if self._overlap_sync_grad:
self._attach_reduction_hook()
@property
@@ -273,7 +277,7 @@ class HybridZeroOptimizer(BaseOptimizer):
global_id = str(i)
for j in range(len(param.size())):
global_id = "_".join([global_id, str(param.size()[j])])
if self._overlap_communication:
if self._overlap_sync_param:
rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
else:
rank_to_go = numel_per_rank.index(min(numel_per_rank))
@@ -394,7 +398,7 @@ class HybridZeroOptimizer(BaseOptimizer):
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
if self._overlap_communication:
if self._overlap_sync_grad:
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
@@ -517,7 +521,7 @@ class HybridZeroOptimizer(BaseOptimizer):
# if not overlapping communication (no reduction hook is attached)
# we need to manually reduce these gradients
if not self._overlap_communication:
if not self._overlap_sync_grad:
for group_id in range(len(self._fp16_param_groups)):
for param in self._fp16_param_groups[group_id]:
if param.grad is not None:
@@ -532,7 +536,7 @@ class HybridZeroOptimizer(BaseOptimizer):
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
# clear reduced grads
if self._overlap_communication:
if self._overlap_sync_grad:
# grads in the last bucket is reduced
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
@@ -641,7 +645,7 @@ class HybridZeroOptimizer(BaseOptimizer):
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
fp16_param.data.copy_(fp32_param)
with torch.cuda.stream(self._comm_stream):
with torch.cuda.stream(self._broadcast_comm_stream):
self.broadcast_params()
timer("step").stop()
@@ -668,7 +672,7 @@ class HybridZeroOptimizer(BaseOptimizer):
async_op=True,
)
if self._overlap_communication:
if self._overlap_sync_param:
self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
else:
handles.append(handle)
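
Net effect in the optimizer: the single overlap switch is split in two, with overlap_sync_grad owning the gradient-reduction stream and reduction hooks, and overlap_sync_param owning the parameter-broadcast stream and the ParamBcastSyncHandler rank assignment. A condensed sketch of the stream selection (simplified, assumes a CUDA device is available):

import torch

class OverlapStreams:
    # Each overlap feature gets its own side stream only when its flag is on;
    # otherwise it falls back to the current (default) stream, i.e. no overlap.
    def __init__(self, overlap_sync_grad: bool, overlap_sync_param: bool):
        self.comm_stream = (
            torch.cuda.Stream() if overlap_sync_grad else torch.cuda.current_stream()
        )
        self.broadcast_comm_stream = (
            torch.cuda.Stream() if overlap_sync_param else torch.cuda.current_stream()
        )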

View File

@@ -1,7 +1,6 @@
from .training_internlm import (
get_train_data_loader,
get_validation_data_loader,
initialize_distributed_env,
initialize_llm_profile,
initialize_model,
initialize_optimizer,
@@ -12,7 +11,6 @@ from .training_internlm import (
__all__ = [
"get_train_data_loader",
"get_validation_data_loader",
"initialize_distributed_env",
"initialize_llm_profile",
"initialize_model",
"initialize_optimizer",

View File

@@ -10,7 +10,6 @@ import torch.distributed as dist
from torch import nn
from torch.utils.data import ConcatDataset, DataLoader
import internlm
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
@@ -31,7 +30,7 @@ from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
from internlm.utils.common import DummyProfile, get_master_node
from internlm.utils.common import DummyProfile
from internlm.utils.logger import get_logger
from internlm.utils.megatron_timers import megatron_timer as timer
from internlm.utils.parallel import (
@@ -44,32 +43,6 @@ from internlm.utils.registry import MODEL_INITIALIZER
logger = get_logger(__file__)
def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (str): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
internlm.launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
internlm.launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
def initialize_model():
"""
Initialize model.
@@ -119,7 +92,11 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
param_bcast_sync_handler = ParamBcastSyncHandler(model)
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model)
else:
param_bcast_sync_handler = None
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
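
With the handler now optional, the optimizer wiring looks roughly like the sketch below (keyword names follow the attribute names visible in this diff and should be treated as assumptions; model is the already-initialized model object):

import torch
from internlm.core.context import global_context as gpc
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.solver.optimizer.utils import ParamBcastSyncHandler

param_bcast_sync_handler = (
    ParamBcastSyncHandler(model)
    if gpc.config.hybrid_zero_optimizer.overlap_sync_param
    else None  # no handler needed when the param broadcast is not overlapped
)
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
    params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
)
optimizer = HybridZeroOptimizer(
    naive_optimizer,
    param_bcast_sync_handler=param_bcast_sync_handler,
    grad_scal_cfg=gpc.config.grad_scaler,
    zero_cfg=gpc.config.hybrid_zero_optimizer,
)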

View File

@@ -15,6 +15,7 @@ from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.scheduler import SchedulerMetricHook
from internlm.core.trainer import TrainState
from internlm.initialize import initialize_distributed_env
from internlm.model.loss import FlashGPTLMLoss
from internlm.model.metrics import AccPerplex
from internlm.monitor import initialize_monitor_manager, send_alert_message
@@ -22,7 +23,6 @@ from internlm.monitor.monitor import monitor_manager as mm
from internlm.train import (
get_train_data_loader,
get_validation_data_loader,
initialize_distributed_env,
initialize_llm_profile,
initialize_model,
initialize_optimizer,