diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index e865e79..92a93d0 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -4,7 +4,7 @@ DO_ALERT = False
 SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
+MLP_RATIO = 4 / 3
 NUM_LAYER = 32
 VOCAB_SIZE = 103168
 
@@ -30,6 +30,14 @@ ckpt = dict(
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
     # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
     load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
+    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
+    # with an automatic restart mechanism upon training reboot.
+    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
+    # path specified in `load_ckpt_info` by default.
+    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
+    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
+    auto_resume=True,
     checkpoint_every=CHECKPOINT_EVERY,
     async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
     async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
@@ -43,7 +51,7 @@ data = dict(
     # micro_num means the number of micro_batch contained in one gradient update
     micro_num=4,
     # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=1,
+    micro_bsz=2,
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
@@ -81,8 +89,8 @@ grad_scaler = dict(
 
 hybrid_zero_optimizer = dict(
     # Enable low_level_optimzer overlap_communication
-    overlap_sync_grad=True,
-    overlap_sync_param=True,
+    overlap_sync_grad=False,
+    overlap_sync_param=False,
     # bucket size for nccl communication params
     reduce_bucket_size=512 * 1024 * 1024,
     # grad clipping
@@ -133,7 +141,7 @@ model = dict(
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    num_experts=4,
+    num_experts=8,
     moe_use_residual=False,
     moe_gate_k=2,
 )
@@ -150,8 +158,8 @@ pipeline parallel (dict):
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=-1,
-    tensor=2,
+    zero1=dict(size=-1, fsdp=False),
+    tensor=1,
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=False,
 )
diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index fead575..2087ae4 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -349,7 +349,7 @@ def args_sanity_check():
         assert (
             not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param
         ), "not support overlap and moe at the same time"
-        assert gpc.config.parallel.zero1 == -1, "moe only support zero1, set zero1=-1 can fix this"
+        assert gpc.config.parallel.zero1.size == -1, "moe only support zero1, set zero1=-1 can fix this"
 
 
 def launch(
diff --git a/internlm/utils/simple_memory_profiler.py b/internlm/utils/simple_memory_profiler.py
index 9caf0a2..8a688ed 100644
--- a/internlm/utils/simple_memory_profiler.py
+++ b/internlm/utils/simple_memory_profiler.py
@@ -424,7 +424,9 @@ class SimpleMemoryProfiler:
                 layer_name, output.element_size() * output.nelement(), flush=False
             )
 
-    def _activation_trace_hook_forward(self, chunk_id: int, model: Any, inputs: Any, output: torch.Tensor) -> None:
+    def _activation_trace_hook_forward(
+        self, chunk_id: int, model: Any, inputs: Any, output: Any  # pylint: disable=W0613
+    ) -> None:
         """
         Hook function to trace the activation memory usage for a forward pass.
 
@@ -437,7 +439,6 @@ class SimpleMemoryProfiler:
         None
         """
         del model, inputs
-        assert isinstance(output, torch.Tensor), f"invalid output type: {type(output)}"
 
         if self._stoped:
             return