2022-12-28 05:37:40 +00:00
|
|
|
from typing import Callable, List
|
2022-11-04 02:55:09 +00:00
|
|
|
|
|
|
|
import torch
|
|
|
|
|
2023-09-19 06:20:26 +00:00
|
|
|
from colossalai.auto_parallel.tensor_shard.sharding_strategy import OperationData, ShardingStrategy, TrainCycleItem
|
2022-11-04 02:55:09 +00:00
|
|
|
from colossalai.tensor.sharding_spec import ShardingSpec
|
|
|
|
|
2023-01-03 02:30:15 +00:00
|
|
|
from .constants import INPLACE_MODULE, INPLACE_OPS, NO_SAVE_ACTIVATION
|
2022-11-04 02:55:09 +00:00
|
|
|
from .registry import meta_register
|
|
|
|
|
2023-09-19 06:20:26 +00:00
|
|
|
__all__ = ["ShardMetaInfo"]
|
2022-11-04 02:55:09 +00:00
|
|
|
|
|
|
|
|
2023-04-04 09:40:45 +00:00
|
|
|
class ShardMetaInfo:
    """Meta information of a target evaluated under a sharding strategy.

    An instance holds the values returned by the meta function that is
    registered in ``meta_register`` for ``target``. The meta function is called
    with one ``OperationData`` per entry of ``strategy.sharding_specs``, each
    carrying zero tensors on the "meta" device with the per-device sharded shape.

    The result attributes ``compute_cost``, ``memory_cost``, ``fwd_in``,
    ``fwd_buffer`` and ``fwd_out`` are only declared in ``__init__``; they are
    assigned by ``compute_shard_metainfo``, which runs automatically as soon as
    both a strategy and a target are set.
    """

    def __init__(self, strategy: ShardingStrategy = None, target: Callable = None) -> None:
        """Store ``strategy`` and ``target`` and compute the meta info when both are given.

        Args:
            strategy: sharding strategy to evaluate. May be ``None`` and assigned
                later through the ``strategy`` property.
            target: module or function to profile. May be ``None`` and assigned
                later through the ``target`` property.
        """
        # The annotations below declare the result attributes without assigning
        # them; ``compute_shard_metainfo`` fills them in.

        # compute cost of forward and backward computation
        self.compute_cost: TrainCycleItem

        # compute memory cost of forward and backward phase
        self.memory_cost: TrainCycleItem

        # list of input tensors
        self.fwd_in: List[torch.Tensor]

        # list of buffer tensors
        self.fwd_buffer: List[torch.Tensor]

        # list of output tensors
        self.fwd_out: List[torch.Tensor]

        # sharding strategy
        self._strategy = strategy

        # target function
        self._target = target

        # compute shard_metainfo if possible
        self._maybe_compute_shard_metainfo()

    def _maybe_compute_shard_metainfo(self) -> None:
        """Recompute the meta info once both the strategy and the target are set."""
        if self._strategy is not None and self._target is not None:
            self.compute_shard_metainfo()

    @property
    def strategy(self) -> ShardingStrategy:
        """The sharding strategy currently used for the meta info."""
        return self._strategy

    @property
    def target(self) -> Callable:
        """The module or function currently being profiled."""
        return self._target

    @strategy.setter
    def strategy(self, strategy: ShardingStrategy) -> None:
        # Assigning a new strategy refreshes the meta info when a target is set.
        self._strategy = strategy
        self._maybe_compute_shard_metainfo()

    @target.setter
    def target(self, target: Callable) -> None:
        # Assigning a new target refreshes the meta info when a strategy is set.
        self._target = target
        self._maybe_compute_shard_metainfo()

    def compute_sharded_opdata(self, operation_data: OperationData, sharding_spec: ShardingSpec):
        """
        Compute sharded opdata based on the given data and sharding spec.

        Args:
            operation_data: the operation data to shard. When ``sharding_spec`` is
                a list or tuple, ``operation_data.data`` must be a list or tuple
                of the same length.
            sharding_spec: a single ``ShardingSpec``, or a list/tuple of objects
                providing ``get_sharded_shape_per_device()``.

        Returns:
            A new ``OperationData`` with the name and type of ``operation_data``.
            Its data is a zero tensor on the "meta" device (single spec) or a
            list of such tensors (list/tuple of specs), shaped by
            ``get_sharded_shape_per_device()``. ``logical_shape`` is carried
            over only in the single-spec case.

        Raises:
            AssertionError: in the list/tuple case, if the data is not a list or
                tuple or its length differs from that of ``sharding_spec``.
            ValueError: if ``sharding_spec`` is neither a ``ShardingSpec`` nor a
                list/tuple.
        """
        if isinstance(sharding_spec, ShardingSpec):
            return OperationData(
                name=operation_data.name,
                data=torch.zeros(sharding_spec.get_sharded_shape_per_device(), device="meta"),
                type=operation_data.type,
                logical_shape=operation_data.logical_shape,
            )

        if isinstance(sharding_spec, (list, tuple)):
            data = operation_data.data
            assert isinstance(data, (list, tuple)), f"Data Should be list or tuple, but got {type(data)}."
            assert len(data) == len(sharding_spec), "Length of data and sharding spec should be the same."
            # Only the specs determine the shapes; the data elements themselves are not read.
            sharded_data = [torch.zeros(spec.get_sharded_shape_per_device(), device="meta") for spec in sharding_spec]
            return OperationData(name=operation_data.name, data=sharded_data, type=operation_data.type)

        raise ValueError(f"Sharding spec should be ShardingSpec or list, but got {type(sharding_spec)}.")

    def compute_shard_metainfo(self):
        """
        Compute meta info based on sharding strategy and the given target function.

        The meta function is looked up in ``meta_register`` by the class of the
        target (module case) or, failing that, by the target itself (function
        case). It is called with the sharded operation data of every entry in
        ``self._strategy.sharding_specs`` and an ``inplace`` keyword argument,
        and its five return values are stored on ``compute_cost``,
        ``memory_cost``, ``fwd_in``, ``fwd_buffer`` and ``fwd_out``. ``fwd_in``
        is reset to an empty list for targets listed in ``NO_SAVE_ACTIVATION``.

        Raises:
            AssertionError: if neither the target's class nor the target itself
                is registered in ``meta_register``.
        """
        target = self._target
        target_cls = target.__class__

        # Modules are registered by their class, functions by the callable itself.
        is_module = meta_register.has(target_cls)
        assert is_module or meta_register.has(target), f"Meta info for {self._target} is not registered."
        registry_key = target_cls if is_module else target
        meta_func = meta_register.get(registry_key)

        # check whether the target in the list that we don't need to save activation
        # The list is queried with the same key as the registry. Testing the class
        # of a plain function (as the function branch previously did) can never
        # match an entry that is itself a function, which made the corner case
        # below unreachable for functions.
        save_fwd_in = registry_key not in NO_SAVE_ACTIVATION

        # construct args for meta_func
        args = [self.compute_sharded_opdata(op_data, spec) for op_data, spec in self._strategy.sharding_specs.items()]

        # construct kwargs
        # A module target is an instance (its ``inplace`` attribute is read here)
        # while module lookups are keyed by class, so the class is checked too.
        # The original test on the target object is kept for backward compatibility.
        # NOTE(review): confirm that INPLACE_MODULE holds module classes.
        if target_cls in INPLACE_MODULE or target in INPLACE_MODULE:
            inplace = target.inplace
        elif target in INPLACE_OPS:
            inplace = True
        else:
            inplace = False
        kwargs = {"inplace": inplace}

        # compute metainfo with meta_func
        self.compute_cost, self.memory_cost, self.fwd_in, self.fwd_buffer, self.fwd_out = meta_func(*args, **kwargs)

        # process corner case for NO_SAVE_ACTIVATION
        if not save_fwd_in:
            self.fwd_in = []
|