mirror of https://github.com/hpcaitech/ColossalAI
from colossalai.context import ParallelMode
from colossalai.nn.layer import WrappedDropout as Dropout


def moe_sa_args(d_model: int,
                n_heads: int,
                d_kv: int,
                attention_drop: float = 0,
                drop_rate: float = 0,
                bias: bool = True):
    """An example of building the args for MoE self-attention, since many modules
    must be adapted before they can be placed inside experts.
    """
    dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR)
    dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    return dict(
        d_model=d_model,
        n_heads=n_heads,
        d_kv=d_kv,
        bias=bias,
        dropout1=dropout1,
        dropout2=dropout2
    )
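

# Illustrative usage (not part of the original file): the dict returned by
# moe_sa_args is meant to be unpacked into the constructor of a self-attention
# expert module. `SelfAttentionExpert` below is a hypothetical placeholder for
# such a class, and the call assumes ColossalAI has already been launched so
# that WrappedDropout can resolve ParallelMode.TENSOR.
#
#     sa_kwargs = moe_sa_args(d_model=768, n_heads=12, d_kv=64,
#                             attention_drop=0.1, drop_rate=0.1)
#     expert = SelfAttentionExpert(**sa_kwargs)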


def moe_mlp_args(d_model: int,
                 d_ff: int,
                 drop_rate: float,
                 bias: bool = True):
    """An example of building the args for the MLP inside experts, since many modules
    must be adapted before they can be placed inside experts.
    """
    dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    return dict(
        d_model=d_model,
        d_ff=d_ff,
        bias=bias,
        dropout1=dropout1,
        dropout2=dropout2
    )
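

# Illustrative usage (not part of the original file): moe_mlp_args mirrors
# moe_sa_args for the feed-forward expert. `MlpExpert` is a hypothetical
# placeholder for whatever expert class consumes these kwargs; as above, a
# launched ColossalAI context is assumed.
#
#     mlp_kwargs = moe_mlp_args(d_model=768, d_ff=3072, drop_rate=0.1)
#     expert = MlpExpert(**mlp_kwargs)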