ColossalAI/colossalai/shardformer/policies/t5.py

from colossalai.shardformer.layer import (
    DropoutForParallelInput,
    Embedding1D,
    FusedRMSNorm,
    Linear1D_Col,
    Linear1D_Row,
    VocabParallelEmbedding1D,
)
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription

from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription

__all__ = ["T5ModelPolicy", "T5ForConditionalGenerationPolicy", "T5EncoderPolicy"]


class T5BasePolicy(Policy):
    """Sharding policy shared by every T5 variant in this module.

    The policy describes, per Hugging Face T5 building block, which
    attributes and submodules should be swapped when tensor parallelism
    and/or fused normalization are enabled in ``self.shard_config``.
    """

    def config_sanity_check(self):
        """Perform no configuration checks for T5."""
        pass

    def preprocess(self):
        """Pad the vocabulary so its size is divisible by the tensor parallel size.

        Only acts when tensor parallelism is enabled and
        ``config.vocab_size`` is not already a multiple of
        ``shard_config.tensor_parallel_size``; in that case the token
        embeddings are resized up to the next multiple.

        Returns:
            The (possibly resized) model.
        """
        if self.shard_config.enable_tensor_parallelism:
            vocab_size = self.model.config.vocab_size
            world_size = self.shard_config.tensor_parallel_size
            remainder = vocab_size % world_size
            if remainder != 0:
                # round up to the next multiple of world_size
                new_vocab_size = vocab_size + world_size - remainder
                self.model.resize_token_embeddings(new_vocab_size)
        return self.model

    def module_policy(self):
        """Build the replacement policy for the common T5 building blocks.

        Returns:
            dict: maps a ``transformers`` T5 module class to the
            ``ModulePolicyDescription`` that should be applied to it. The
            dict is empty when neither tensor parallelism nor fused
            normalization is enabled.
        """
        # imported lazily so that ``transformers`` is only required when the policy is used
        from transformers.models.t5.modeling_t5 import (
            T5Attention,
            T5DenseActDense,
            T5DenseGatedActDense,
            T5LayerCrossAttention,
            T5LayerFF,
            T5LayerSelfAttention,
            T5Stack,
        )

        policy = {}

        if self.shard_config.enable_tensor_parallelism:
            config = self.model.config
            tp_size = self.shard_config.tensor_parallel_size

            policy[T5Stack] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
                SubModuleReplacementDescription(
                    suffix="embed_tokens",
                    target_module=VocabParallelEmbedding1D,
                ),
            ])
            policy[T5LayerSelfAttention] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
            ])
            policy[T5LayerCrossAttention] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
            ])
            policy[T5Attention] = ModulePolicyDescription(
                # per-rank sizes: each attribute is the config value divided by the tensor parallel size
                attribute_replacement={
                    "d_model": config.d_model // tp_size,
                    "n_heads": config.num_heads // tp_size,
                    "inner_dim": config.num_heads * config.d_kv // tp_size,
                },
                sub_module_replacement=[
                    SubModuleReplacementDescription(
                        suffix="q",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="k",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="v",
                        target_module=Linear1D_Col,
                    ),
                    SubModuleReplacementDescription(
                        suffix="o",
                        target_module=Linear1D_Row,
                    ),
                    # only some attention layers own a relative attention bias,
                    # hence ignore_if_not_exist
                    SubModuleReplacementDescription(
                        suffix="relative_attention_bias",
                        target_module=Embedding1D,
                        kwargs=dict(gather_output=False),
                        ignore_if_not_exist=True,
                    ),
                ],
            )
            policy[T5LayerFF] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
            ])
            # In the gated feed-forward, wi_0 and wi_1 both consume the same
            # hidden states and their outputs are multiplied element-wise, so
            # they must use the same layout. They are sharded like ``wi`` in
            # T5DenseActDense below, and ``wo`` like its ``wo``, which also
            # matches the DropoutForParallelInput that sits between them.
            policy[T5DenseGatedActDense] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="wi_0",
                    target_module=Linear1D_Col,
                ),
                SubModuleReplacementDescription(
                    suffix="wi_1",
                    target_module=Linear1D_Col,
                ),
                SubModuleReplacementDescription(
                    suffix="wo",
                    target_module=Linear1D_Row,
                ),
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
            ])
            policy[T5DenseActDense] = ModulePolicyDescription(sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="wi",
                    target_module=Linear1D_Col,
                ),
                SubModuleReplacementDescription(
                    suffix="wo",
                    target_module=Linear1D_Row,
                ),
                SubModuleReplacementDescription(
                    suffix="dropout",
                    target_module=DropoutForParallelInput,
                ),
            ])

        # optimization configuration
        if self.shard_config.enable_fused_normalization:
            # each (suffix, owner) pair is registered exactly once
            fused_norm_targets = (
                ("layer_norm", T5LayerFF),
                ("layer_norm", T5LayerSelfAttention),
                ("layer_norm", T5LayerCrossAttention),
                ("final_layer_norm", T5Stack),
            )
            for suffix, target_key in fused_norm_targets:
                self.append_or_create_submodule_replacement(
                    description=SubModuleReplacementDescription(
                        suffix=suffix,
                        target_module=FusedRMSNorm,
                    ),
                    policy=policy,
                    target_key=target_key,
                )
        return policy

    def postprocess(self):
        """Return the model unchanged; no post-sharding fix-up is done here."""
        return self.model


class T5ModelPolicy(T5BasePolicy):
    """Policy for ``transformers.T5Model``."""

    def module_policy(self):
        """Extend the base T5 policy with the model-level ``shared`` embedding.

        Returns:
            dict: the base policy, with a ``VocabParallelEmbedding1D``
            replacement for ``shared`` registered under ``T5Model`` when
            tensor parallelism is enabled.
        """
        from transformers import T5Model

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        shared_embedding = SubModuleReplacementDescription(
            suffix="shared",
            target_module=VocabParallelEmbedding1D,
        )
        self.append_or_create_submodule_replacement(
            description=shared_embedding,
            policy=policy,
            target_key=T5Model,
        )
        return policy


class T5ForConditionalGenerationPolicy(T5BasePolicy):
    """Policy for ``transformers.T5ForConditionalGeneration``."""

    def module_policy(self):
        """Extend the base T5 policy with the ``shared`` embedding and ``lm_head``.

        Returns:
            dict: the base policy, with replacements for ``shared`` and
            ``lm_head`` registered under ``T5ForConditionalGeneration``
            when tensor parallelism is enabled.
        """
        from transformers import T5ForConditionalGeneration

        base_policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return base_policy

        shared_embedding = SubModuleReplacementDescription(
            suffix="shared",
            target_module=VocabParallelEmbedding1D,
        )
        lm_head = SubModuleReplacementDescription(
            suffix="lm_head",
            target_module=Linear1D_Col,
            kwargs=dict(gather_output=True),
        )
        self.append_or_create_submodule_replacement(
            description=[shared_embedding, lm_head],
            policy=base_policy,
            target_key=T5ForConditionalGeneration,
        )
        return base_policy


class T5EncoderPolicy(T5BasePolicy):
    """Policy for ``transformers.T5EncoderModel``."""

    def module_policy(self):
        """Extend the base T5 policy with the encoder model's ``shared`` embedding.

        Returns:
            dict: the base policy, with a ``VocabParallelEmbedding1D``
            replacement for ``shared`` registered under ``T5EncoderModel``
            when tensor parallelism is enabled.
        """
        from transformers import T5EncoderModel

        policy = super().module_policy()
        if not self.shard_config.enable_tensor_parallelism:
            return policy

        shared_embedding = SubModuleReplacementDescription(
            suffix="shared",
            target_module=VocabParallelEmbedding1D,
        )
        self.append_or_create_submodule_replacement(
            description=shared_embedding,
            policy=policy,
            target_key=T5EncoderModel,
        )
        return policy
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`from colossalai.shardformer.layer import (`
			`DropoutForParallelInput,`
			`Embedding1D,`
			`FusedRMSNorm,`
			`Linear1D_Col,`
			`Linear1D_Row,`
			`VocabParallelEmbedding1D,`
			`)`
[shardformer] rename policy file name 2023-07-05 07:13:00 +00:00			`from colossalai.shardformer.policies.base_policy import ModulePolicyDescription`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] rename policy file name 2023-07-05 07:13:00 +00:00			`from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription`
[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00
			`__all__ = ["T5ModelPolicy", "T5ForConditionalGenerationPolicy", "T5EncoderPolicy"]`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00

[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`class T5BasePolicy(Policy):`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] supported fused normalization (#4112) 2023-06-30 01:32:37 +00:00			`def config_sanity_check(self):`
			`pass`

[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00			`def preprocess(self):`
			`# reshape the embedding layer`
			`r"""`
			`Reshape the Embedding layer to make the embedding dimension divisible by world_size`
			`"""`
[shardformer] support lazy init (#4202) * [shardformer] support lazy init * [shardformer] linear support lazy init * [shardformer] embedding support lazy init * [shardformer] norm support lazy init * [shardformer] fused linear support lazy init * [test] update shardformer test layer * [test] shardformer with lazy init fit ddp * [lazy] hotfix deepcopy of param * [shardformer] fix bert policy and update test * [shardformer] fix bloom policy and update test * [shardformer] fix opt policy and update test * [shardformer] fix t5 policy and update test * [shardformer] fix gpt2 policy and update test * [shardformer] fix llama policy and update test 2023-07-10 02:48:53 +00:00			`if self.shard_config.enable_tensor_parallelism:`
			`vocab_size = self.model.config.vocab_size`
			`world_size = self.shard_config.tensor_parallel_size`
			`if vocab_size % world_size != 0:`
			`new_vocab_size = vocab_size + world_size - vocab_size % world_size`
			`self.model.resize_token_embeddings(new_vocab_size)`
[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00			`return self.model`

			`def module_policy(self):`
[shardformer] import huggingface implicitly (#4101) 2023-06-30 02:56:29 +00:00			`from transformers.models.t5.modeling_t5 import (`
			`T5Attention,`
			`T5DenseActDense,`
			`T5DenseGatedActDense,`
			`T5LayerCrossAttention,`
			`T5LayerFF,`
			`T5LayerSelfAttention,`
			`T5Stack,`
			`)`

[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00			`policy = {}`

			`if self.shard_config.enable_tensor_parallelism:`
			`policy[T5Stack] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="embed_tokens",`
[shardformer] support inplace sharding (#4251) * [shardformer] embedding support inplace sharding * [shardformer] linear support inplace sharding * [shardformer] layernorm support inplace sharding * [shardformer] qkv support inplace sharding * [test] update shardformer layer test * [shardformer] fix shared param sharding * [shardformer] fix bert policy * [shardformer] fix bloom policy * [shardformer] fix llama policy * [shardformer] fix opt policy * [shardformer] fix t5 policy * [shardformer] fix fused qkv linear * [shardformer] fix bugs * force sync * [test] fix bugs * [test] fix transformer version 2023-07-20 02:39:06 +00:00			`target_module=VocabParallelEmbedding1D,`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00			`)`
			`])`
			`policy[T5LayerSelfAttention] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`),`
			`])`
			`policy[T5LayerCrossAttention] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`)`
			`])`
			`policy[T5Attention] = ModulePolicyDescription(attribute_replacement={`
			`"d_model":`
			`self.model.config.d_model // self.shard_config.tensor_parallel_size,`
			`"n_heads":`
			`self.model.config.num_heads // self.shard_config.tensor_parallel_size,`
			`"inner_dim":`
			`self.model.config.num_heads * self.model.config.d_kv // self.shard_config.tensor_parallel_size`
			`},`
			`sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="q",`
			`target_module=Linear1D_Col,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="k",`
			`target_module=Linear1D_Col,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="v",`
			`target_module=Linear1D_Col,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="o",`
			`target_module=Linear1D_Row,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="relative_attention_bias",`
			`target_module=Embedding1D,`
			`kwargs=dict(gather_output=False),`
			`ignore_if_not_exist=True)`
			`])`
			`policy[T5LayerFF] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`),`
			`])`
			`policy[T5DenseGatedActDense] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="wi_0",`
			`target_module=Linear1D_Col,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="wi_1",`
			`target_module=Linear1D_Row,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="wo", target_module=Linear1D_Col, kwargs=dict(gather_output=True)),`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`)`
			`])`
			`policy[T5DenseActDense] = ModulePolicyDescription(sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="wi",`
			`target_module=Linear1D_Col,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="wo",`
			`target_module=Linear1D_Row,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="dropout",`
			`target_module=DropoutForParallelInput,`
			`)`
			`])`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] supported fused normalization (#4112) 2023-06-30 01:32:37 +00:00			`# optimization configuration`
			`if self.shard_config.enable_fused_normalization:`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
			`suffix="layer_norm",`
			`target_module=FusedRMSNorm,`
			`),`
			`policy=policy,`
			`target_key=T5LayerFF)`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
			`suffix="layer_norm",`
			`target_module=FusedRMSNorm,`
			`),`
			`policy=policy,`
			`target_key=T5LayerFF)`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
			`suffix="layer_norm", target_module=FusedRMSNorm),`
			`policy=policy,`
			`target_key=T5LayerSelfAttention)`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
			`suffix="layer_norm", target_module=FusedRMSNorm),`
			`policy=policy,`
			`target_key=T5LayerCrossAttention)`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
			`suffix="final_layer_norm", target_module=FusedRMSNorm),`
			`policy=policy,`
			`target_key=T5Stack)`
			`return policy`
[shardformer] supported fused normalization (#4112) 2023-06-30 01:32:37 +00:00
[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00			`def postprocess(self):`
			`return self.model`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00

[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`class T5ModelPolicy(T5BasePolicy):`

			`def module_policy(self):`
			`from transformers import T5Model`
			`base_policy = super().module_policy()`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00
			`if self.shard_config.enable_tensor_parallelism:`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
[shardformer] refactored some doc and api (#4137) * [shardformer] refactored some doc and api * polish code 2023-07-03 07:29:11 +00:00			`suffix="shared",`
			`target_module=VocabParallelEmbedding1D,`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00			`),`
			`policy=base_policy,`
			`target_key=T5Model)`
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`return base_policy`


			`class T5ForConditionalGenerationPolicy(T5BasePolicy):`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00			`def module_policy(self):`
[shardformer] import huggingface implicitly (#4101) 2023-06-30 02:56:29 +00:00			`from transformers import T5ForConditionalGeneration`

[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00			`policy = super().module_policy()`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00
			`if self.shard_config.enable_tensor_parallelism:`
			`self.append_or_create_submodule_replacement(description=[`
			`SubModuleReplacementDescription(`
			`suffix="shared",`
			`target_module=VocabParallelEmbedding1D,`
			`),`
			`SubModuleReplacementDescription(suffix="lm_head",`
			`target_module=Linear1D_Col,`
			`kwargs=dict(gather_output=True))`
			`],`
			`policy=policy,`
			`target_key=T5ForConditionalGeneration)`
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`return policy`
[shardformer] supported T5 and its variants (#4045) 2023-06-19 09:57:37 +00:00
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`class T5EncoderPolicy(T5BasePolicy):`
[shardformer] shardformer support t5 model (#3994) test t5 2023-06-15 08:50:08 +00:00
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`def module_policy(self):`
			`from transformers import T5EncoderModel`

			`base_policy = super().module_policy()`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00
			`if self.shard_config.enable_tensor_parallelism:`
			`self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(`
[shardformer] refactored some doc and api (#4137) * [shardformer] refactored some doc and api * polish code 2023-07-03 07:29:11 +00:00			`suffix="shared",`
			`target_module=VocabParallelEmbedding1D,`
[shardformer] made tensor parallelism configurable (#4144) * [shardformer] made tensor parallelism configurable * polish code 2023-07-04 01:57:03 +00:00			`),`
			`policy=base_policy,`
			`target_key=T5EncoderModel)`
[shardformer] added embedding gradient check (#4124) 2023-06-30 08:16:44 +00:00			`return base_policy`