
[test] add check

colossalchat
hxwang authored 4 months ago, committed by Hongxin Liu
commit 6c39f0b144

2 changed files:
  1. colossalai/booster/plugin/moe_hybrid_parallel_plugin.py (3 lines changed)
  2. tests/test_shardformer/test_model/test_shard_deepseek.py (15 lines changed)

colossalai/booster/plugin/moe_hybrid_parallel_plugin.py (3 lines changed)

@@ -73,6 +73,9 @@ class MoeHybridParallelZeroOptimizer(HybridParallelZeroOptimizer):
             moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
         }

+        if len(pg_param_list[dp_process_group]) == 0 or len(pg_param_list[moe_dp_group]) == 0:
+            raise ValueError("No parameters found in dp_process_group or moe_dp_group")
+
         super().__init__(
             model=model,
             optimizer=optimizer,
tests/test_shardformer/test_model/test_shard_deepseek.py (15 lines changed)

@@ -17,29 +17,27 @@ from colossalai.testing.random import seed_all
 from tests.test_moe.moe_utils import assert_loose_close, check_model_equal

 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 4000, 2
 NUM_LAYERS = 4
 HIDDEN_SIZE_PER_HEAD = 4
 NUM_HEADS = 4
-TOP_K = 1
+TOP_K = 2

 CHECKED_CONFIG = [  # FOR_WORLD=8
     (2, 1, 1, 4, 1),
     (4, 1, 1, 2, 1),
     (4, 1, 1, 1, 1),
     (2, 1, 2, 1, 1),
 ]


 @parameterize(
     "config",
     [
-        # (2, 1, 2, 1, 1), # TODO debug deepseek pp
-        # (2, 1, 2, 2, 1), # TODO debug deepseek pp
+        (2, 1, 2, 1, 1),
+        # (2, 1, 1, 2, 1),
+        (2, 1, 1, 1, 2),

-        # (2, 1, 4, 1, 1), # TODO debug deepseek pp
-        # (4, 1, 2, 1, 1), # TODO debug deepseek pp
-        # (2, 1, 1, 1, 2),
     ],
 )
 def run_zero_with_original_model(config: Tuple[int, ...]):
@@ -69,13 +67,12 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     booster = Booster(plugin=plugin)

     assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
-    # config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
     config = AutoConfig.from_pretrained(
         "deepseek-ai/deepseek-moe-16b-base",
         hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
         intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
         moe_intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
-        num_hidden_layers=2,
+        num_hidden_layers=4,
         num_attention_heads=NUM_HEADS,
         num_key_value_heads=NUM_HEADS,
         first_k_dense_replace=1,
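
As a quick sanity check on the test configuration (simple arithmetic, assuming only the constant values shown in the first hunk), the tiny DeepSeek model ends up with a 16-dim hidden state, and raising num_hidden_layers from 2 to 4 keeps it consistent with NUM_LAYERS = 4 and the pp_size <= NUM_LAYERS assertion, so a pipeline of up to 4 stages still gets at least one layer per stage:

# Values implied by the constants in the first hunk of this test file.
HIDDEN_SIZE_PER_HEAD = 4
NUM_HEADS = 4
NUM_LAYERS = 4

hidden_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS                # 16
intermediate_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2      # 32
moe_intermediate_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2  # 32
num_hidden_layers = 4  # now equal to NUM_LAYERS, so pp_size can go up to 4

assert num_hidden_layers == NUM_LAYERS
print(hidden_size, intermediate_size, moe_intermediate_size)  # 16 32 32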
