ColossalAI/colossalai/tensor/colo_tensor.py

180 lines
6.9 KiB
Python
Raw Normal View History

import math
from typing import Callable, Optional, Tuple

import torch
from numpy import product

from colossalai.context import parallel_mode
from colossalai.core import global_context as gpc
from colossalai.nn.layer.utils import divide
from colossalai.tensor import ComputePattern, ParallelAction, TensorSpec

from .op_wrapper import _COLOSSAL_OPS
class ColoTensor(object):
    """ Data Structure for Tensor in Colossal-AI
    1. It contains a torch.Tensor as an attribute.
    2. It supports lazy init the tensor's payload.
    3. It can hijack the torch functions which using ColoTensors as args to our customized functions.
    4. It supports distributing the tensor's payload to the shards among processes. (TODO)
    """

    def __new__(cls, *args, **kwargs):
        return super(ColoTensor, cls).__new__(cls)

    def __init__(
        self,
        *size: Tuple[int],
        dtype=None,
        requires_grad=False,
        pin_memory=False,
        device=None,
        torch_tensor: Optional[torch.Tensor] = None,
        shard_spec: Optional[TensorSpec] = None,
    ):
        """Build a (possibly lazily-materialized) ColoTensor.

        Args:
            *size: shape of the tensor.
            dtype: dtype of the payload.
            requires_grad: whether autograd records operations on the payload.
            pin_memory: whether the payload is allocated in pinned memory.
            device: device the payload lives on.
            torch_tensor: an existing payload; when omitted an empty
                placeholder is used so the real payload can be allocated
                lazily by torch_tensor().
            shard_spec: distributed layout spec; a fresh default TensorSpec
                is created when omitted.
        """
        self._size = size
        self._dtype = dtype
        self._requires_grad = requires_grad
        self._pin_memory = pin_memory
        self._device = device
        # None sentinels instead of `torch.empty(0)` / `TensorSpec()` defaults:
        # evaluating those in the signature would share one object across every
        # call (the mutable-default-argument pitfall).
        self._torch_tensor = torch.empty(0) if torch_tensor is None else torch_tensor
        self._shard_spec = TensorSpec() if shard_spec is None else shard_spec

    def __getitem__(self, key):
        # Indexing materializes the payload and wraps the result back up.
        return ColoTensor.init_from_torch_tensor(self.torch_tensor()[key])

    @property
    def shard_spec(self) -> TensorSpec:
        return self._shard_spec

    @property
    def data(self):
        return self._torch_tensor.data

    @property
    def grad(self):
        return self._torch_tensor.grad

    @property
    def shape(self):
        return torch.Size(self._size)

    @property
    def device(self):
        return self._torch_tensor.device

    def size(self, dim=None):
        """Return the full shape, or the extent along `dim` when given.

        NOTE: the original file also declared a `size` property that this
        method immediately shadowed; the dead property has been removed.
        """
        if dim is None:
            return self.shape
        return self._size[dim]

    def dim(self):
        return len(self._size)

    def normal_(self, mean=0., std=1.):
        # In-place fill with samples from N(mean, std); materializes the payload.
        torch_tensor = self.torch_tensor()
        return torch_tensor.normal_(mean=mean, std=std)

    def numel(self):
        # math.prod replaces numpy's deprecated `product` alias (removed in
        # NumPy 2.0); the empty-shape case still yields 1.
        return math.prod(self._size)

    @staticmethod
    def init_from_torch_tensor(tensor: torch.Tensor, save_payload=True) -> 'ColoTensor':
        """Wrap an existing torch.Tensor, copying its metadata.

        Args:
            tensor: the source tensor.
            save_payload: keep `tensor` as the payload; when False only the
                metadata is kept and the payload is re-allocated lazily.
        """
        colo_t = ColoTensor(*tensor.size(),
                            dtype=tensor.dtype,
                            requires_grad=tensor.requires_grad,
                            pin_memory=tensor.is_pinned(),
                            device=tensor.device,
                            torch_tensor=tensor if save_payload else torch.empty(0))
        return colo_t

    def del_torch_tensor(self, save_shape=False) -> None:
        """
        delete the payload of the torch tensor.

        Args:
            save_shape (bool, optional): if saving the shape of the torch_tensor.
            If saving the shape, the size of self._torch_tensor is inconsistent with the self._size.
            Defaults to False.
        """
        if not save_shape:
            self._size = (0,)
        self._torch_tensor = torch.empty((0,), device=self._device, dtype=self._dtype)

    def torch_tensor(self) -> torch.Tensor:
        # Lazy materialization: an empty placeholder payload is replaced by a
        # freshly-allocated tensor matching the stored metadata on first use.
        if self._torch_tensor.numel() == 0:
            self._torch_tensor = torch.empty(*self._size,
                                             dtype=self._dtype,
                                             pin_memory=self._pin_memory,
                                             requires_grad=self._requires_grad,
                                             device=self._device)
        return self._torch_tensor

    def set_spec(self, spec: TensorSpec, lazy_shard: bool = False) -> None:
        """Attach a shard spec; shard immediately unless `lazy_shard` is set."""
        self._shard_spec = spec
        if not lazy_shard:
            self._shard()

    def _shard(self):
        """Slice the payload to this rank's shard according to the shard spec."""
        assert self._shard_spec is not None, 'You should call set_spec() before _shard() ColoTensor.'
        if self._shard_spec.num_action == 1:
            if ComputePattern.TP1DRow in self._shard_spec.compute_patterns:
                parallel_action = self._shard_spec.get_action_by_compute_pattern(ComputePattern.TP1DRow)
                num_partition = gpc.get_world_size(parallel_action.parallel_mode)
                local_rank = gpc.get_local_rank(parallel_action.parallel_mode)
                dim = -1
                chunk_size = divide(self._size[dim], num_partition)
                # Reshape to get shard for this rank and we don't want autograd
                # recording here for the narrow op and 'local_shard' should be a
                # leaf variable in the autograd graph.
                self._torch_tensor = self._torch_tensor.narrow(dim, local_rank * chunk_size, chunk_size).detach(
                ).contiguous()    # TODO Shall we clone() here since detach() will point to the old tensor?
                self._torch_tensor.requires_grad = self._requires_grad
                self._size = self._torch_tensor.size()

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        """Dispatch torch functions on ColoTensors.

        Hijacked ops go to _COLOSSAL_OPS; anything else falls back to the
        plain torch function with ColoTensors unwrapped, and torch.Tensor
        results re-wrapped.
        """
        global _COLOSSAL_OPS
        # Normalize up front: torch passes kwargs=None, and the hijacked-op
        # branch used to crash on `kwargs.values()` in that case.
        if kwargs is None:
            kwargs = {}
        if func in _COLOSSAL_OPS:
            for arg in args:
                if isinstance(arg, ColoTensor):
                    return _COLOSSAL_OPS[func](types, args, kwargs, None)

            for kwarg in kwargs.values():
                if isinstance(kwarg, ColoTensor):
                    return _COLOSSAL_OPS[func](types, args, kwargs, None)
        else:
            # If we have not hijacked the function, convert the ColoTensors in
            # args and kwargs to torch tensors.
            args = [arg.torch_tensor() if isinstance(arg, ColoTensor) else arg for arg in args]
            kwargs = {k: v.torch_tensor() if isinstance(v, ColoTensor) else v for k, v in kwargs.items()}
            return cls._filter_outputs_with_colo(func(*args, **kwargs))

    def backward(self, gradient: Optional[torch.Tensor] = None, retain_graph: bool = False):
        self._torch_tensor.backward(gradient=gradient, retain_graph=retain_graph)

    def __getattr__(self, name):
        # Only reached when `name` is not found on ColoTensor itself: proxy the
        # lookup to the payload, wrapping any callable so that torch.Tensor
        # results come back as ColoTensors.
        def replace_tensor_with_colo(func):

            def execute_func(*args, **kwargs):
                return self._filter_outputs_with_colo(func(*args, **kwargs))

            return execute_func

        attr = getattr(self._torch_tensor, name)
        if callable(attr):
            return replace_tensor_with_colo(attr)
        else:
            return attr

    @classmethod
    def _filter_outputs_with_colo(cls, outputs):
        # Re-wrap torch.Tensor results as ColoTensors; pass everything else through.
        if outputs is None:    # return None
            return None
        elif type(outputs) is not tuple:    # num of return val = 1
            return ColoTensor.init_from_torch_tensor(outputs) if type(outputs) is torch.Tensor else outputs
        else:    # num of return val > 1
            return tuple([
                ColoTensor.init_from_torch_tensor(output) if type(output) is torch.Tensor else output
                for output in outputs
            ])