|
|
@ -17,29 +17,27 @@ from colossalai.testing.random import seed_all
|
|
|
|
from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
|
|
|
|
from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
|
|
|
|
|
|
|
|
|
|
|
|
NUM_BATCH = 8
|
|
|
|
NUM_BATCH = 8
|
|
|
|
NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
|
|
|
|
NUM_TOK_PER_BATCH, NUM_EXPERTS = 4000, 2
|
|
|
|
NUM_LAYERS = 4
|
|
|
|
NUM_LAYERS = 4
|
|
|
|
HIDDEN_SIZE_PER_HEAD = 4
|
|
|
|
HIDDEN_SIZE_PER_HEAD = 4
|
|
|
|
NUM_HEADS = 4
|
|
|
|
NUM_HEADS = 4
|
|
|
|
TOP_K = 1
|
|
|
|
TOP_K = 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CHECKED_CONFIG = [ # FOR_WORLD=8
|
|
|
|
CHECKED_CONFIG = [ # FOR_WORLD=8
|
|
|
|
(2, 1, 1, 4, 1),
|
|
|
|
(2, 1, 1, 4, 1),
|
|
|
|
(4, 1, 1, 2, 1),
|
|
|
|
(4, 1, 1, 2, 1),
|
|
|
|
(4, 1, 1, 1, 1),
|
|
|
|
(4, 1, 1, 1, 1),
|
|
|
|
|
|
|
|
(2, 1, 2, 1, 1),
|
|
|
|
]
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@parameterize(
|
|
|
|
@parameterize(
|
|
|
|
"config",
|
|
|
|
"config",
|
|
|
|
[
|
|
|
|
[
|
|
|
|
# (2, 1, 2, 1, 1), # TODO debug deepseek pp
|
|
|
|
(2, 1, 2, 1, 1),
|
|
|
|
# (2, 1, 2, 2, 1), # TODO debug deepseek pp
|
|
|
|
|
|
|
|
# (2, 1, 1, 2, 1),
|
|
|
|
# (2, 1, 1, 2, 1),
|
|
|
|
(2, 1, 1, 1, 2),
|
|
|
|
# (2, 1, 1, 1, 2),
|
|
|
|
# (2, 1, 4, 1, 1), # TODO debug deepseek pp
|
|
|
|
|
|
|
|
# (4, 1, 2, 1, 1), # TODO debug deepseek pp
|
|
|
|
|
|
|
|
],
|
|
|
|
],
|
|
|
|
)
|
|
|
|
)
|
|
|
|
def run_zero_with_original_model(config: Tuple[int, ...]):
|
|
|
|
def run_zero_with_original_model(config: Tuple[int, ...]):
|
|
|
@ -69,13 +67,12 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
|
|
|
|
booster = Booster(plugin=plugin)
|
|
|
|
booster = Booster(plugin=plugin)
|
|
|
|
|
|
|
|
|
|
|
|
assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
|
|
|
|
assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
|
|
|
|
# config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
|
|
|
|
|
|
|
|
config = AutoConfig.from_pretrained(
|
|
|
|
config = AutoConfig.from_pretrained(
|
|
|
|
"deepseek-ai/deepseek-moe-16b-base",
|
|
|
|
"deepseek-ai/deepseek-moe-16b-base",
|
|
|
|
hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
|
|
|
|
hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
|
|
|
|
intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
|
|
|
|
intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
|
|
|
|
moe_intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
|
|
|
|
moe_intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
|
|
|
|
num_hidden_layers=2,
|
|
|
|
num_hidden_layers=4,
|
|
|
|
num_attention_heads=NUM_HEADS,
|
|
|
|
num_attention_heads=NUM_HEADS,
|
|
|
|
num_key_value_heads=NUM_HEADS,
|
|
|
|
num_key_value_heads=NUM_HEADS,
|
|
|
|
first_k_dense_replace=1,
|
|
|
|
first_k_dense_replace=1,
|
|
|
|