# mirror of https://github.com/hpcaitech/ColossalAI
import uuid
from functools import partial

import torch
import torch.distributed as dist
from torch.types import _bool, _device, _dtype
from torch.utils._pytree import tree_flatten, tree_map

from ._monkey_patch import _AliasATen, _DistCommMethod, _InplaceATen, _MaybeInplaceATen, _TorchOverrideableFactoryMethod

__all__ = ['MetaTensor', 'MetaTensorMode']


def register_storage(r, data_ptr_fn=None):
    # Attach a stable ``data_ptr`` to the tensor: either the caller-provided one, or a
    # fresh uuid-backed fake pointer when the tensor reports a null data pointer.
    if isinstance(r, torch.Tensor):
        if data_ptr_fn is not None:
            r.data_ptr = data_ptr_fn
        elif not r.data_ptr():
            data_ptr = uuid.uuid1()
            r.data_ptr = lambda: data_ptr


def _normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# a hack for in-place execution in PyTorch: these ops alias (or may alias) their inputs
def _assert_alias(func):
    return func in (_AliasATen + _InplaceATen + _MaybeInplaceATen)    # TODO: check if should be this aggressive
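

# Illustrative sketch, not part of the original module: ``register_storage`` gives a tensor
# on the `meta` device a stable, uuid-backed ``data_ptr`` so later alias bookkeeping has
# something to compare, since a fresh meta tensor reports a null data pointer. The helper
# name ``_register_storage_sketch`` is made up for this example.
def _register_storage_sketch():
    t = torch.empty(4, device='meta')    # no real storage is allocated
    register_storage(t)                  # attaches a uuid-backed fake pointer
    assert t.data_ptr()                  # now a stable, truthy identifier
    return t.data_ptr()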


class MetaTensor(torch.Tensor):
    """
    A wrapping tensor that hacks ``torch.autograd`` without patching more ``torch.ops.aten`` ops.

    ``device`` is the device that a ``MetaTensor`` is supposed to run on. Meta tensors give you
    the ability to run PyTorch code without having to actually do computation through tensors
    allocated on a `meta` device. Because the device is `meta`, meta tensors do not model
    device propagation. ``MetaTensor`` compensates by carrying an extra ``device`` attribute
    that records the device the tensor would have been placed on. See the usage sketch after
    this class definition for an end-to-end example.

    Reference:
        https://github.com/pytorch/pytorch/blob/master/torch/_subclasses/fake_tensor.py
    """

    _tensor: torch.Tensor

    @staticmethod
    def __new__(cls, elem, device=None, data_ptr_fn=None):
        requires_grad = elem.requires_grad
        # Avoid multiple wrapping
        while isinstance(elem, MetaTensor):
            device = elem.device if device is None else device
            elem = elem._tensor

        # The wrapping tensor (MetaTensor) shouldn't hold any
        # memory for the class in question, but it should still
        # advertise the same device as before
        r = torch.Tensor._make_wrapper_subclass(
            cls,
            elem.size(),
            strides=elem.stride(),
            storage_offset=elem.storage_offset(),
            dtype=elem.dtype,
            layout=elem.layout,
            device=device or (elem.device if elem.device.type != 'meta' else torch.device('cpu')),
            requires_grad=requires_grad)    # deceive the frontend for aten selections
        r._tensor = elem
        # ...the real tensor is held as an element on the tensor.
        if not r._tensor.is_meta:
            val = elem.data_ptr()
            data_ptr_fn = lambda: val
            r._tensor = r._tensor.to(torch.device('meta'))

        # only tensor not on `meta` should be copied to `meta`
        register_storage(r._tensor, data_ptr_fn)
        if isinstance(elem, torch.nn.Parameter):
            r = torch.nn.Parameter(r)
        return r

    def __repr__(self):
        name = 'MetaParameter' if getattr(self, '_is_param', False) else 'MetaTensor'
        if self.grad_fn:
            return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
        return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}    # guard against the default ``None`` so the lookup below is safe
        device = None

        def unwrap(x):
            nonlocal device
            if isinstance(x, MetaTensor):
                device = x.device
                x = x._tensor
            elif isinstance(x, torch.Tensor):
                device = x.device
                x = x.to(torch.device('meta'))
            return x

        args = tree_map(unwrap, args)
        kwargs = tree_map(unwrap, kwargs)

        if 'device' in kwargs:
            device = kwargs['device']
            kwargs['device'] = torch.device('meta')

        # run aten for backend=CPU but actually on backend=Meta
        # here we detect whether or not the execution generates a physical copy
        # of the input tensor
        ret = func(*args, **kwargs)

        if _assert_alias(func):
            val = args[0].data_ptr()
            tree_map(partial(register_storage, data_ptr_fn=lambda: val), _normalize_tuple(ret))

        # Now, we want to continue propagating this tensor, so we rewrap Tensors in
        # our custom tensor subclass
        def wrap(x):
            return MetaTensor(x, device=device) if isinstance(x, torch.Tensor) else x

        return tree_map(wrap, ret)

    def to(self, *args, **kwargs) -> torch.Tensor:
        """An extension of `torch.Tensor.to()` to MetaTensor
        Returns:
            result (MetaTensor): MetaTensor
        Usage:
            >>> tensor = MetaTensor(torch.rand(10), device='cuda:100')
            >>> tensor.to(torch.uint8)
            MetaTensor(tensor(..., device='meta', size=(10,), dtype=torch.uint8), device='cuda:100')
            >>> tensor.to(torch.device('cuda:42'))
            MetaTensor(tensor(..., device='meta', size=(10,)), device='cuda:42')
            >>> tensor.to('vulkan')
            MetaTensor(tensor(..., device='meta', size=(10,)), device='vulkan')
        """
        # mimic the C++-style overloads of ``Tensor.to``: a ``str`` or ``torch.device``
        # argument selects the target device, everything else is passed through
        device = None

        def replace(x):
            nonlocal device
            if isinstance(x, (str, _device)):
                device = x
                return torch.device('meta')
            return x

        elem = self._tensor.to(*tree_map(replace, args), **tree_map(replace, kwargs))
        return MetaTensor(elem, device=device)

    def cpu(self, *args, **kwargs):
        if self.device.type == 'cpu':
            return self.to(*args, **kwargs)
        return self.to(*args, device='cpu', **kwargs)

    def cuda(self, device=None, non_blocking=False):
        if device is not None:
            return self.to(device=device, non_blocking=non_blocking)
        return self.to(device='cuda:0', non_blocking=non_blocking)

    def data_ptr(self):
        return self._tensor.data_ptr()
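

# Illustrative usage sketch, not part of the original module; it assumes a CPU-only
# environment and the helper name ``_metatensor_usage_sketch`` is made up. Wrapping a real
# CPU tensor moves its data to the `meta` device while the wrapper keeps advertising the
# requested device, ops dispatched through ``__torch_dispatch__`` propagate that device,
# and in-place results report the same ``data_ptr`` as their input.
def _metatensor_usage_sketch():
    x = MetaTensor(torch.rand(4, 4), device='cuda:0')    # no CUDA memory is touched
    y = x + x                                            # executed on the `meta` backend
    assert isinstance(y, MetaTensor) and str(y.device) == 'cuda:0'
    z = x.add_(1)                                        # in-place: aliases ``x``
    assert z.data_ptr() == x.data_ptr()                  # the fake pointer is preserved
    return y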


class MetaTensorMode(object):
    """
    A context manager that enables MetaTensor mode.

    Usage:
        >>> with MetaTensorMode():
        >>>     # all torch.xxx and torch.distributed.xxx will be replaced by patched functions
        >>>     # and the actual execution will be on torch.device('meta')
        >>>     a = torch.rand(100000, 100000)
        >>>     b = torch.rand(100000, 100000)
        >>>     c = torch.mm(a, b)
    """

    def __init__(self):
        self.torch_overrides = {}    # override torch.xxx
        self.dist_overrides = {}    # override torch.distributed.xxx

    def __enter__(self):

        def _dummy(*args, **kwargs):
            pass

        def _new(*args, orig_new=torch.empty, **kwargs):
            # allocate on the `meta` device, but remember the device the caller asked for
            return MetaTensor(orig_new(*args, **{**kwargs, 'device': 'meta'}),
                              device=kwargs.get('device', torch.device('cpu')))

        for func in _TorchOverrideableFactoryMethod:
            self.torch_overrides[func] = getattr(torch, func)
            setattr(torch, func, partial(_new, orig_new=getattr(torch, func)))

        for func in _DistCommMethod:
            self.dist_overrides[func] = getattr(dist, func)
            setattr(dist, func, _dummy)

    def __exit__(self, exc_type, exc_value, traceback):
        for func, func_impl in self.torch_overrides.items():
            setattr(torch, func, func_impl)

        for func, func_impl in self.dist_overrides.items():
            setattr(dist, func, func_impl)
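

# Quick self-check sketch, not part of the original module: under ``MetaTensorMode`` the
# patched factory functions return ``MetaTensor`` objects backed by the `meta` device, so
# even huge tensors cost no real memory. The sizes below mirror the docstring example.
if __name__ == '__main__':
    with MetaTensorMode():
        a = torch.rand(100000, 100000)    # ~37 GiB if it were real; nothing is allocated here
        b = torch.rand(100000, 100000)
        c = torch.mm(a, b)
    assert isinstance(c, MetaTensor) and tuple(c.shape) == (100000, 100000)
    print(c)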