ColossalAI/colossalai/shardformer/policies/gpt2.py

from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model

import colossalai.shardformer.layer as col_nn

from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription


class GPT2Policy(Policy):
    r"""
    Sharding policy for GPT-2.

    Describes, for ``GPT2Model`` and ``GPT2Block``, which attributes are rescaled by the
    tensor parallel size and which sub-modules are swapped for their
    ``colossalai.shardformer.layer`` counterparts.
    """

    def preprocess(self):
        r"""
        Reshape the Embedding layer to make the vocabulary size divisible by world_size.

        When ``vocab_size`` is not a multiple of ``shard_config.tensor_parallel_size``,
        the token embedding is resized up to the next multiple.

        Returns:
            The model held by this policy.
        """
        vocab_size = self.model.config.vocab_size
        world_size = self.shard_config.tensor_parallel_size
        remainder = vocab_size % world_size
        if remainder != 0:
            # Round up to the next multiple of world_size.
            new_vocab_size = vocab_size + world_size - remainder
            self.model.resize_token_embeddings(new_vocab_size)
        return self.model

    def module_policy(self):
        r"""
        Build the replacement description for every module type this policy shards.

        Returns:
            dict: maps ``GPT2Model`` and ``GPT2Block`` to the ``ModulePolicyDescription``
            listing their attribute and sub-module replacements.

        Raises:
            ValueError: if ``hidden_size`` or ``num_attention_heads`` is not divisible by
                ``shard_config.tensor_parallel_size``. Floor division would otherwise
                silently produce per-rank sizes that do not agree with each other.
        """
        config = self.model.config
        tp_size = self.shard_config.tensor_parallel_size
        hidden_size = config.hidden_size
        num_heads = config.num_attention_heads

        # Fail early with a readable message instead of sharding with truncated sizes.
        if hidden_size % tp_size != 0:
            raise ValueError(f"hidden_size ({hidden_size}) must be divisible by "
                             f"tensor_parallel_size ({tp_size}).")
        if num_heads % tp_size != 0:
            raise ValueError(f"num_attention_heads ({num_heads}) must be divisible by "
                             f"tensor_parallel_size ({tp_size}).")

        # Per-rank sizes written onto each block's attention module.
        hidden_per_rank = hidden_size // tp_size
        heads_per_rank = num_heads // tp_size

        return {
            GPT2Model:
                ModulePolicyDescription(
                    attribute_replacement={},
                    param_replacement=[],
                    sub_module_replacement=[
                        SubModuleReplacementDescription(
                            suffix="wte",
                            target_module=col_nn.VocabParallelEmbedding1D,
                        ),
                    ],
                ),
            GPT2Block:
                ModulePolicyDescription(
                    attribute_replacement={
                        "attn.embed_dim": hidden_per_rank,
                        "attn.split_size": hidden_per_rank,
                        "attn.num_heads": heads_per_rank,
                    },
                    param_replacement=[],
                    sub_module_replacement=[
                        # c_attn is replaced with n_cast=3, the others with n_cast=1.
                        SubModuleReplacementDescription(
                            suffix="attn.c_attn",
                            target_module=col_nn.LinearConv1D_Col,
                            kwargs={
                                "n_cast": 3,
                            },
                        ),
                        SubModuleReplacementDescription(
                            suffix="attn.c_proj",
                            target_module=col_nn.LinearConv1D_Row,
                            kwargs={
                                "n_cast": 1,
                            },
                        ),
                        SubModuleReplacementDescription(
                            suffix="mlp.c_fc",
                            target_module=col_nn.LinearConv1D_Col,
                            kwargs={
                                "n_cast": 1,
                            },
                        ),
                        SubModuleReplacementDescription(
                            suffix="mlp.c_proj",
                            target_module=col_nn.LinearConv1D_Row,
                            kwargs={
                                "n_cast": 1,
                            },
                        ),
                        SubModuleReplacementDescription(
                            suffix="attn.attn_dropout",
                            target_module=col_nn.Dropout1D,
                        ),
                        SubModuleReplacementDescription(
                            suffix="attn.resid_dropout",
                            target_module=col_nn.Dropout1D,
                        ),
                        SubModuleReplacementDescription(
                            suffix="mlp.dropout",
                            target_module=col_nn.Dropout1D,
                        ),
                    ],
                ),
        }

    def new_model_class(self):
        r"""
        Return the model held by this policy unchanged.
        """
        return self.model

    def postprocess(self):
        r"""
        Return the model held by this policy; no post-processing is applied.
        """
        return self.model


# GPT2Model
class GPT2ModelPolicy(GPT2Policy):
    r"""
    Policy named for ``GPT2Model``.

    Reuses ``GPT2Policy`` as-is: no sharding rule is overridden here.
    """

    def __init__(self) -> None:
        # No extra state of its own; only the parent initializer runs.
        super().__init__()
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00			`from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model`

[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`import colossalai.shardformer.layer as col_nn`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00

			`class GPT2Policy(Policy):`

[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`def preprocess(self):`
			`# reshape the embedding layer`
			`r"""`
			`Reshape the Embedding layer to make the embedding dimension divisible by world_size`
			`"""`
			`vocab_size = self.model.config.vocab_size`
			`world_size = self.shard_config.tensor_parallel_size`
			`if vocab_size % world_size != 0:`
			`new_vocab_size = vocab_size + world_size - vocab_size % world_size`
			`self.model.resize_token_embeddings(new_vocab_size)`
			`return self.model`

			`def module_policy(self):`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00			`return {`
			`GPT2Model:`
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`ModulePolicyDescription(attribute_replacement={},`
			`param_replacement=[],`
			`sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="wte",`
			`target_module=col_nn.VocabParallelEmbedding1D,`
			`),`
			`]),`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00			`GPT2Block:`
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`ModulePolicyDescription(attribute_replacement={`
			`"attn.embed_dim": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,`
			`"attn.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,`
			`"attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,`
			`},`
			`param_replacement=[],`
			`sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="attn.c_attn",`
			`target_module=col_nn.LinearConv1D_Col,`
			`kwargs={`
			`"n_cast": 3,`
			`},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="attn.c_proj",`
			`target_module=col_nn.LinearConv1D_Row,`
			`kwargs={`
			`"n_cast": 1,`
			`},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.c_fc",`
			`target_module=col_nn.LinearConv1D_Col,`
			`kwargs={`
			`"n_cast": 1,`
			`},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.c_proj",`
			`target_module=col_nn.LinearConv1D_Row,`
			`kwargs={`
			`"n_cast": 1,`
			`},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="attn.attn_dropout",`
			`target_module=col_nn.Dropout1D,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="attn.resid_dropout",`
			`target_module=col_nn.Dropout1D,`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.dropout",`
			`target_module=col_nn.Dropout1D,`
			`),`
			`])`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00			`}`

[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`def new_model_class(self):`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`return self.model`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`def postprocess(self):`
			`return self.model`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00

[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`# GPT2Model`
			`class GPT2ModelPolicy(GPT2Policy):`
[shardformer] add gpt2 policy and modify shard and slicer to support (#3883) * add gpt2 policy and modify shard and slicer to support * remove unused code * polish code 2023-06-07 08:09:40 +00:00
[shardformer] add gpt2 test and layer class refactor (#4041) * add gpt2 test and layer class refactor * add dropout in gpt2 policy 2023-06-20 03:45:16 +00:00			`def __init__(self) -> None:`
			`super().__init__()`