from .op_wrapper import _COLOSSAL_OPS
from copy import copy

import torch
from torch.overrides import get_default_nowrap_functions

from colossalai.tensor import TensorSpec, distspec
from colossalai.tensor.dist_spec_mgr import DistSpecManager
from colossalai.tensor.distspec import _DistSpec

from .const import TensorType


def _convert_output(output):
    """Wrap plain torch.Tensor results into ColoTensor, recursing into lists and tuples."""
    if isinstance(output, torch.Tensor) and not isinstance(output, ColoTensor):
        output = ColoTensor.from_torch_tensor(output)
    elif isinstance(output, (list, tuple)):
        output = type(output)(_convert_output(o) for o in output)
    return output

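# Rough illustration of _convert_output's behavior (comment only, not executed;
# the shapes and values are arbitrary examples):
#     _convert_output(torch.ones(2))         -> ColoTensor wrapping torch.ones(2)
#     _convert_output((torch.ones(2), 3.0))  -> (ColoTensor(...), 3.0)
#     _convert_output("hello")               -> "hello"  (non-tensor objects pass through)
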
class ColoTensor(torch.Tensor):
    """Data structure for tensors in Colossal-AI.

    1. It holds a torch.Tensor as its payload.
    2. It supports lazy initialization of the tensor's payload.
    3. It hijacks torch functions that take ColoTensors as arguments and redirects
       them to our customized implementations.
    4. It supports distributing the tensor's payload as shards among processes. (TODO)
    """

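    # Minimal usage sketch (illustrative only; it assumes the surrounding Colossal-AI
    # runtime, e.g. any required process groups, has already been set up):
    #     t = ColoTensor.from_torch_tensor(torch.randn(4, 4))
    #     out = torch.sum(t)            # dispatched through __torch_function__ below
    #     isinstance(out, ColoTensor)   # True: results are wrapped by _convert_output
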
    def __new__(cls, data: torch.Tensor, spec: TensorSpec = TensorSpec(distspec.replicate())) -> 'ColoTensor':
        if data is None:
            data = torch.empty(0)
        return torch.Tensor._make_subclass(cls, data, data.requires_grad)

    def __init__(self, data: torch.Tensor, spec: TensorSpec = TensorSpec(distspec.replicate())) -> None:
        self._spec = copy(spec)
        self._type = TensorType.NONMODEL
        self._graph_node = None

    @property
    def spec(self) -> TensorSpec:
        return self._spec

    def set_spec(self, spec: TensorSpec) -> None:
        """Apply a new TensorSpec, converting the payload to its dist spec in place."""
        spec = copy(spec)
        self.convert_to_dist_spec_(spec.dist_spec)
        self._spec = spec

    def has_spec(self) -> bool:
        return self._spec.num_action > 0

    def is_model_data(self) -> bool:
        return self._type == TensorType.MODEL

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}

        if not all(issubclass(cls, t) for t in types):
            return NotImplemented

        # Redirect to a Colossal-AI customized implementation when one is registered.
        global _COLOSSAL_OPS
        if func in _COLOSSAL_OPS:
            func = _COLOSSAL_OPS[func]

        with torch._C.DisableTorchFunction():
            ret = func(*args, **kwargs)
            if func in get_default_nowrap_functions():
                return ret
            else:
                return _convert_output(ret)

    def __repr__(self):
        return f'ColoTensor: {super().__repr__()}'

    def convert_to_dist_spec_(self, dist_spec: _DistSpec) -> None:
        """Convert the payload to ``dist_spec`` in place and update ``self._spec`` accordingly."""
        with DistSpecManager.no_grad():
            self.data = DistSpecManager.handle_trans_spec(self, self.spec.dist_spec, dist_spec)
        self._spec.dist_spec = dist_spec

    def convert_to_dist_spec(self, dist_spec: _DistSpec) -> 'ColoTensor':
        """Return a new ColoTensor whose payload follows ``dist_spec``; ``self`` is left unchanged."""
        spec = copy(self._spec)
        spec.dist_spec = dist_spec
        ret = DistSpecManager.handle_trans_spec(self, self.spec.dist_spec, dist_spec)
        return ColoTensor.from_torch_tensor(ret, spec)

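    # Sketch of a replicate -> shard conversion (comment only; the exact arguments of
    # distspec.shard() and the process-group object ``pg`` are assumptions and may differ
    # between Colossal-AI versions):
    #     shard_spec = distspec.shard(process_group=pg, dims=[0], num_partitions=[pg.size()])
    #     local = t.convert_to_dist_spec(shard_spec)   # new ColoTensor holding this rank's shard
    #     t.convert_to_dist_spec_(shard_spec)          # or convert t's own payload in place
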
    @staticmethod
    def from_torch_tensor(tensor: torch.Tensor, spec: TensorSpec = TensorSpec(distspec.replicate())) -> 'ColoTensor':
        """Wrap an existing torch.Tensor as a ColoTensor without copying its storage."""
        tensor = tensor.as_subclass(ColoTensor)
        tensor.__init__(tensor, spec=spec)
        return tensor

    def __deepcopy__(self, memo):
        if id(self) in memo:
            return memo[id(self)]
        else:
            with torch._C.DisableTorchFunction():
                data = self.data.clone()
            tensor = ColoTensor(data, spec=copy(self.spec))
            memo[id(self)] = tensor
            return tensor