# ColossalAI/colossalai/shardformer/shard/shardformer.py

import torch.nn as nn
from torch.utils.data import Dataset

from colossalai.cluster import DistCoordinator, ProcessGroupManager

from ..policies.basepolicy import Policy
from .shard_config import ShardConfig
from .sharder import ModelSharder


class ShardFormer:
    """
    Parallelize a model based on the given shard config and sharding policy.

    Typical flow: build a `ShardConfig`, construct a `ShardFormer` from it, call
    `init_distributed()` to create the process group manager, then hand the model
    to `shard_model()`.

    Example:

    ```python
    from colossalai.shardformer import ShardFormer, ShardConfig
    from transformers import BertForMaskedLM
    import colossalai
    import torch

    colossalai.launch_from_torch(config={})

    org_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    shard_config = ShardConfig(
        tensor_parallel_size=2,
        tensor_parallel_mode='1d',
    )
    shard_former = ShardFormer(shard_config=shard_config)
    shard_former.init_distributed()
    model = shard_former.shard_model(org_model)
    ```
    """

    def __init__(self, shard_config: ShardConfig):
        """
        Set up the state shared by the other methods.

        Three attributes are initialised:
        1. `shard_config`: the config passed in, later forwarded to `ModelSharder`
        2. `pg_manager`: placeholder for the process group manager, `None` for now
        3. `coordinator`: a freshly created `DistCoordinator`

        Args:
            shard_config (`ShardConfig`): the config for distribute information
        """
        self.shard_config = shard_config
        # Stays None until init_distributed() is called.
        self.pg_manager = None
        self.coordinator = DistCoordinator()

    def init_distributed(self) -> ProcessGroupManager:
        """
        Create the process group manager and register the 1d process group on it.

        A single group named 'tp1d' is created over ranks 0 .. world_size - 1, where
        world_size is read from the coordinator. The manager is stored on
        `self.pg_manager` once the group has been created.

        Returns:
            `ProcessGroupManager`: the newly created manager
        """
        manager = ProcessGroupManager()

        # Only the 1d group spanning every rank is created at the moment.
        # TODO: may need to support other parallel modes when the config has such a field
        every_rank = range(self.coordinator.world_size)
        manager.create_process_group(name='tp1d', ranks=every_rank)

        self.pg_manager = manager
        return manager

    def shard_model(self, model: nn.Module, policy: Policy = None):
        """
        Shard the given PyTorch model.

        A `ModelSharder` is built from the model, the stored shard config, the policy and
        the stored process group manager, and its `shard()` method is run. The process
        group manager is passed as-is, so it is `None` unless `init_distributed()` was
        called beforehand.

        Args:
            model (`torch.nn.Module`): the original huggingface model
            policy (`Policy`): the custom policy for sharding; defaults to `None`, which is
                forwarded to `ModelSharder` unchanged

        Returns:
            The same `model` object that was passed in, after `ModelSharder.shard()` has run.
        """
        ModelSharder(
            model=model,
            shard_config=self.shard_config,
            policy=policy,
            pg_manager=self.pg_manager,
        ).shard()
        return model

    def shard_dataset(self, dataset: Dataset):
        """
        Shard dataset for DP

        Not implemented yet: `dataset` is ignored and `None` is returned.
        """
        return None