from typing import List, Union

import torch.nn as nn
from torch.distributed import ProcessGroup

from colossalai.shardformer.layer import Linear1D_Col
from colossalai.shardformer.layer.parallel_module import ParallelModule


class BaichuanLMHeadLinear1D_Col(Linear1D_Col):
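    """Column-parallel linear for Baichuan2's NormHead lm_head.

    The native head stores just a weight and normalizes it before projecting
    hidden states onto the vocabulary, so the weight is L2-normalized once at
    conversion time and the module is then sharded as a plain Linear1D_Col.
    """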

    @staticmethod
    def from_native_module(
        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
    ) -> ParallelModule:
        # The native NormHead exposes only a weight, so derive the Linear-style
        # attributes that Linear1D_Col expects from the weight's shape.
        module.in_features = module.weight.size(1)
        module.out_features = module.weight.size(0)
        module.bias = None
        # Normalize each output row of the weight up front, matching the
        # normalization NormHead applies before computing logits.
        module.weight.data = nn.functional.normalize(
            module.weight
        )
        # TODO(lry89757): this may not hold under lazy init, where the weight
        # shardformer sees is not the real one; `BaichuanLMHeadLinear1D_Col`
        # should override load_from_state_dict to fix this potential issue.
        return Linear1D_Col.from_native_module(
            module,
            process_group,
            *args,
            **kwargs,
        )
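

# A minimal usage sketch, not part of the library: converting a dense lm_head
# into its column-parallel form. The dimensions, process-group setup, and
# launch assumptions below are illustrative; `gather_output` is a standard
# Linear1D_Col keyword that from_native_module forwards to the parallel
# layer's constructor.
if __name__ == "__main__":
    # Assumption: launched with torchrun, one process per tensor-parallel rank,
    # with the tensor-parallel size dividing out_features evenly.
    import torch.distributed as dist

    dist.init_process_group(backend="nccl")

    # Stand-in for a NormHead-style head: a bias-free projection from hidden
    # size to vocabulary size (hypothetical dimensions).
    lm_head = nn.Linear(4096, 32000, bias=False)

    parallel_head = BaichuanLMHeadLinear1D_Col.from_native_module(
        lm_head,
        process_group=dist.group.WORLD,
        gather_output=True,  # gather the sharded logits on every rank
    )
    print(parallel_head.weight.shape)  # local shard: (32000 // world_size, 4096)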