From 816ecf8e04343b2e41893d272691c62c0308dd26 Mon Sep 17 00:00:00 2001
From: Qu Wenwen
Date: Thu, 12 Oct 2023 10:56:59 +0800
Subject: [PATCH] fix moe and zero1 check in args_sanity_check

---
 configs/7B_MoE4_sft.py        | 4 ++--
 internlm/initialize/launch.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index e865e79..50f45be 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -150,7 +150,7 @@ pipeline parallel (dict):
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=-1,
+    zero1=dict(size=-1, fsdp=False),
     tensor=2,
     pipeline=dict(size=1, interleaved_overlap=True),
     sequence_parallel=False,
@@ -168,4 +168,4 @@ monitor = dict(
     ),
 )
 
-model_type = "INTERNLM_MoE"
\ No newline at end of file
+model_type = "INTERNLM_MoE"
diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index fead575..b7c9199 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -349,7 +349,7 @@ def args_sanity_check():
         assert (
             not optim_ckpt.overlap_sync_grad & optim_ckpt.overlap_sync_param
         ), "not support overlap and moe at the same time"
-        assert gpc.config.parallel.zero1 == -1, "moe only support zero1, set zero1=-1 can fix this"
+        assert gpc.config.parallel.zero1.size == -1, "moe only support zero1, set zero1=dict(size=-1,...) can fix this"
 
 
 def launch(