from typing import Any, Callable, Dict, List

import torch
import torch.nn as nn
from transformers.pytorch_utils import Conv1D

from colossalai.cluster.process_group_manager import ProcessGroupManager

from ..policies.autopolicy import get_autopolicy
from ..policies.basepolicy import Policy, SubModuleReplacementDescription
from ..utils.utils import getattr_, setattr_
from .shard_config import ShardConfig

__all__ = ['ModelSharder', 'shard_model']


class ModelSharder(object):
    r"""
    Shard the original huggingface model according to the policy

    Args:
        model (:class:`torch.nn.Module`): The model to shard
        policy (:class:`Policy`): The policy to shard the model with; if ``None``, a policy is
            resolved automatically from the model class
        shard_config (:class:`ShardConfig`): The settings of the distributed model
        pg_manager (:class:`ProcessGroupManager`): The manager of the process groups used for sharding
    """
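
    # A minimal usage sketch. The names ``bert_model``, ``shard_config`` and ``pg_manager`` are
    # assumed to be created elsewhere and are only illustrative:
    #
    #     sharder = ModelSharder(model=bert_model, policy=None,
    #                            shard_config=shard_config, pg_manager=pg_manager)
    #     sharder.shard()
    #
    # Passing ``policy=None`` lets ``get_autopolicy`` pick a policy based on the model class.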

    def __init__(
            self,
            model: nn.Module,
            policy: Policy,
            shard_config: ShardConfig = None,    # TODO
            pg_manager: ProcessGroupManager = None) -> None:
        self.model = model
        # Fall back to the automatically resolved policy when none is given explicitly.
        self.policy = get_autopolicy(self.model) if policy is None else policy
        self.shard_config = shard_config
        self.pg_manager = pg_manager

    def shard(self) -> None:
        r"""
        Shard the model according to the policy
        """
        self.policy.set_model(self.model)
        self.policy.set_shard_config(self.shard_config)
        self.preprocess()
        self.replace_model_class()
        self.replace_module()
        self.postprocess()

    def reshape_embedding(self) -> None:
        r"""
        Reshape the Embedding layer to make the vocabulary size divisible by world_size
        """
        vocab_size = self.model_config.vocab_size
        world_size = self.shard_config.world_size
        if vocab_size % world_size != 0:
            # Round the vocabulary size up to the next multiple of world_size.
            new_vocab_size = vocab_size + world_size - vocab_size % world_size
            self.model.resize_token_embeddings(new_vocab_size)
            self.model_config = self.model.config
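
    # Worked example of the rounding in ``reshape_embedding`` above (numbers are illustrative):
    # with vocab_size = 50257 and world_size = 8, the remainder is 50257 % 8 = 1, so
    # new_vocab_size = 50257 + 8 - 1 = 50264, which is divisible by 8 (50264 = 8 * 6283).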

    def preprocess(self) -> None:
        self.model = self.policy.preprocess()

    def postprocess(self) -> None:
        self.model = self.policy.postprocess()

    def replace_model_class(self) -> None:
        r"""
        Replace the model class with the model class defined in the policy

        Mainly modifies the forward and backward behaviour to fit the distributed model

        e.g.
        ::
            BertForMaskedLM.forward -> BertForMaskedLM_.forward
        """
        new_model_class = self.policy.new_model_class()
        if new_model_class is None:
            return

        # Copy every attribute (typically methods like ``forward``) defined on the policy's
        # replacement class onto the original model class.
        for key in new_model_class.__dict__.keys():
            if hasattr(self.model.__class__, key):
                setattr(
                    self.model.__class__,
                    key,
                    getattr(new_model_class, key),
                )
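
    # Note: because the attributes are patched on ``self.model.__class__`` itself, every live
    # instance of that model class picks up the policy-defined methods, not just ``self.model``.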

    def replace_module(self) -> None:
        r"""
        Replace the modules in the model according to the policy, one module description at a time
        """
        module_descriptions = self.policy.module_policy()
        for origin_layer_cls, description in module_descriptions.items():
            attr_replacement = description.attribute_replacement
            param_replacement = description.param_replacement
            sub_module_replacement = description.sub_module_replacement
            self._recursive_replace_layer(self.model, origin_layer_cls, attr_replacement, param_replacement,
                                          sub_module_replacement)
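
    # For reference, ``Policy.module_policy()`` (consumed above) maps an original layer class to a
    # description object exposing ``attribute_replacement``, ``param_replacement`` and
    # ``sub_module_replacement``. A rough sketch with hypothetical names (``BertLayer`` and
    # ``PolicyDescription`` are illustrative only):
    #
    #     {
    #         BertLayer: PolicyDescription(
    #             attribute_replacement={...},
    #             param_replacement=[...],
    #             sub_module_replacement=[SubModuleReplacementDescription(...)],
    #         ),
    #     }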

    def _recursive_replace_layer(
        self,
        module: nn.Module,
        origin_cls: nn.Module,
        attr_replacement: Dict[str, Any],
        param_replacement: List[Callable],
        sub_module_replacement: List[Callable],
    ) -> None:
        r"""
        Recursively traverse the module and apply the replacements to every submodule whose class
        matches ``origin_cls``

        Args:
            module (:class:`torch.nn.Module`): The module (or submodule) to shard
            origin_cls (:class:`transformers.model`): The original layer class to be replaced
            attr_replacement (Dict): The attribute dict to modify
            param_replacement (List[Callable]): The function list to get parameter shard information in policy
            sub_module_replacement (List[Callable]): The function list to get sub module shard information in policy
        """
        if module.__class__ == origin_cls:
            self._replace_attr(module, attr_replacement)
            self._replace_param(module, param_replacement)
            self._replace_sub_module(module, sub_module_replacement)
        # Recurse into children so that nested occurrences of ``origin_cls`` are also replaced.
        for name, child in module.named_children():
            self._recursive_replace_layer(child, origin_cls, attr_replacement, param_replacement,
                                          sub_module_replacement)

    def _replace_attr(
        self,
        module: nn.Module,
        attr_replacement: Dict[str, Any],
    ) -> None:
        r"""
        Replace the attributes of the module according to ``attr_replacement``

        Args:
            module (:class:`torch.nn.Module`): The module whose attributes are modified
            attr_replacement (Dict): The attribute dict to modify
        """
        for k, v in attr_replacement.items():
            setattr_(module, k, v, ignore=True)
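
    # Illustrative ``attr_replacement`` a policy might supply (the attribute name and value are
    # hypothetical, not taken from a real policy):
    #
    #     attr_replacement = {'num_attention_heads': 4}    # e.g. 16 heads split across 4 TP ranks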

    def _replace_param(
        self,
        module: nn.Module,
        param_replacement: List[Callable],
    ) -> None:
        r"""
        Replace the parameters of the module as described by ``param_replacement``

        Args:
            module (:class:`torch.nn.Module`): The module whose parameters are sharded
            param_replacement (List[Callable]): The function list to get parameter shard information in policy
        """
        # TODO: support parameter shard
        pass

    def _replace_sub_module(
        self,
        org_layer: nn.Module,
        sub_module_replacement: List[SubModuleReplacementDescription],
    ) -> None:
        r"""
        Shard one layer according to the policy. The layer should be an instance of a class that
        appears as a key in the dict returned by the policy's ``module_policy``

        Args:
            org_layer (:class:`torch.nn.Module`): The original layer object to shard
            sub_module_replacement (List[SubModuleReplacementDescription]): The descriptions of the submodules to replace
        """
        for description in sub_module_replacement:
            suffix = description.suffix
            target_module = description.target_module
            kwargs = {} if description.kwargs is None else description.kwargs

            assert target_module is not None, 'target_module should not be None'

            # TODO: support different parallel mode
            native_sub_module = getattr_(org_layer, suffix)
            replace_layer = target_module.from_native_module(native_sub_module, self.pg_manager.pg_store['tp1d'],
                                                             **kwargs)

            setattr_(org_layer, suffix, replace_layer)
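
    # Illustrative sketch of one replacement description as consumed above. The field values and
    # the ``SomeParallelLinear`` target are hypothetical; a real policy supplies its own ``suffix``,
    # ``target_module`` (which must implement ``from_native_module``) and ``kwargs``:
    #
    #     SubModuleReplacementDescription(
    #         suffix='attention.self.query',
    #         target_module=SomeParallelLinear,
    #         kwargs={'gather_output': False},
    #     )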