mirror of https://github.com/hpcaitech/ColossalAI
[test] add check
parent b2952a5982
commit 6c39f0b144
@@ -73,6 +73,9 @@ class MoeHybridParallelZeroOptimizer(HybridParallelZeroOptimizer):
             moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
         }
 
+        if len(pg_param_list[dp_process_group]) == 0 or len(pg_param_list[moe_dp_group]) == 0:
+            raise ValueError("No parameters found in dp_process_group or moe_dp_group")
+
         super().__init__(
             model=model,
             optimizer=optimizer,
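The hunk above adds a fail-fast check to MoeHybridParallelZeroOptimizer: parameters are partitioned into a dense data-parallel group and a MoE group by filtering on is_moe_tensor, and the new guard raises immediately if either group comes out empty (for example, when the plugin is used with a model that has no MoE parameters) instead of letting the ZeRO optimizer fail later with a less obvious error. A minimal sketch of the same partition-and-validate pattern, using a hypothetical helper name outside the plugin, is:

import torch.nn as nn

def split_params_by_moe(model: nn.Module, is_moe_tensor) -> dict:
    # Hypothetical helper mirroring the filter pattern in the diff: one bucket of
    # ordinary (dense) parameters, one bucket of MoE parameters.
    pg_param_list = {
        "dp": [p for p in model.parameters() if not is_moe_tensor(p)],
        "moe_dp": [p for p in model.parameters() if is_moe_tensor(p)],
    }
    # Same failure mode the new check guards against: an empty group would otherwise
    # only surface later, deep inside optimizer setup.
    if len(pg_param_list["dp"]) == 0 or len(pg_param_list["moe_dp"]) == 0:
        raise ValueError("No parameters found in dp_process_group or moe_dp_group")
    return pg_param_list

# With a model that has no MoE parameters, the guard raises right away.
dense_only = nn.Linear(4, 4)
try:
    split_params_by_moe(dense_only, is_moe_tensor=lambda p: False)
except ValueError as e:
    print(e)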
@@ -17,29 +17,27 @@ from colossalai.testing.random import seed_all
 from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
 
 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 4000, 2
 NUM_LAYERS = 4
 HIDDEN_SIZE_PER_HEAD = 4
 NUM_HEADS = 4
-TOP_K = 1
+TOP_K = 2
 
 
 CHECKED_CONFIG = [  # FOR_WORLD=8
     (2, 1, 1, 4, 1),
     (4, 1, 1, 2, 1),
     (4, 1, 1, 1, 1),
+    (2, 1, 2, 1, 1),
 ]
 
 
 @parameterize(
     "config",
     [
-        # (2, 1, 2, 1, 1), # TODO debug deepseek pp
-        # (2, 1, 2, 2, 1), # TODO debug deepseek pp
+        (2, 1, 2, 1, 1),
         # (2, 1, 1, 2, 1),
-        (2, 1, 1, 1, 2),
-        # (2, 1, 4, 1, 1), # TODO debug deepseek pp
-        # (4, 1, 2, 1, 1), # TODO debug deepseek pp
+        # (2, 1, 1, 1, 2),
     ],
 )
 def run_zero_with_original_model(config: Tuple[int, ...]):
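In the test hunk above, @parameterize("config", [...]) drives the test: the decorated function is invoked once per entry in the list, with the entry bound to the named argument. A simplified stand-in for such a decorator (not ColossalAI's actual implementation, shown only to make the control flow concrete) is:

from functools import wraps
from typing import Any, Callable, Iterable

def parameterize(arg_name: str, values: Iterable[Any]) -> Callable:
    # Simplified stand-in: call the wrapped function once per value, bound to arg_name.
    def decorator(fn: Callable) -> Callable:
        @wraps(fn)
        def wrapper(*args, **kwargs):
            for value in values:
                fn(*args, **{**kwargs, arg_name: value})
        return wrapper
    return decorator

@parameterize("config", [(2, 1, 2, 1, 1), (2, 1, 1, 2, 1)])
def run_zero_with_original_model(config):
    print("running with parallel config", config)

run_zero_with_original_model()  # executes the body once per config tuple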
@@ -69,13 +67,12 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     booster = Booster(plugin=plugin)
 
     assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
-    # config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
     config = AutoConfig.from_pretrained(
         "deepseek-ai/deepseek-moe-16b-base",
         hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
         intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
         moe_intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
-        num_hidden_layers=2,
+        num_hidden_layers=4,
         num_attention_heads=NUM_HEADS,
         num_key_value_heads=NUM_HEADS,
         first_k_dense_replace=1,
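For context, the test shrinks the real DeepSeek-MoE architecture by passing size overrides to AutoConfig.from_pretrained; keyword arguments override the corresponding fields loaded from the checkpoint's config. Bumping num_hidden_layers from 2 to 4 presumably brings the actual layer count in line with NUM_LAYERS = 4 and the pp_size <= NUM_LAYERS assertion, so every pipeline stage still receives at least one layer. A small sketch of the same override pattern, using a config class that ships with transformers (LlamaConfig here, purely as a stand-in for the remote DeepSeek config) is:

from transformers import LlamaConfig

HIDDEN_SIZE_PER_HEAD = 4
NUM_HEADS = 4

# Keyword overrides replace the defaults, mirroring how the test builds a tiny model
# instead of instantiating the full 16B checkpoint.
tiny_cfg = LlamaConfig(
    hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,             # 16
    intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,   # 32
    num_hidden_layers=4,
    num_attention_heads=NUM_HEADS,
    num_key_value_heads=NUM_HEADS,
)
assert tiny_cfg.hidden_size == 16 and tiny_cfg.num_hidden_layers == 4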