ColossalAI/colossalai/zero/sharded_param/sharded_param.py

import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.zero.sharded_model._zero3_utils import get_shard
from typing import Union, Tuple, Optional
import numpy


class ShardedParam(object):
    r"""
    A wrapper to torch.nn.Parameter. Shard a param
    on memory space of different processes.

    The wrapper owns the data payload of the parameter. The payload is either
    the full tensor (``is_sharded == False``) or the local shard returned by
    ``get_shard`` (``is_sharded == True``). The original shape, number of
    elements and dtype are recorded so the full tensor can be rebuilt by
    :meth:`gather`.
    """

    def __init__(self,
                 other: Union[torch.nn.Parameter, Tuple[int, ...]],
                 process_group: Optional[dist.ProcessGroup] = None,
                 is_sharded: bool = False,
                 device: Optional[torch.device] = None) -> None:
        r"""
        other: either an existing torch parameter, or a shape tuple, in which case a new
            (uninitialized) payload of that shape is allocated.
        process_group: the process group storing the sharded data. Falls back to the
            data-parallel group of the global context when not given.
        is_sharded: whether to shard the param during __init__.
        device: the device to place the param data payload on. Mandatory when `other`
            is a shape tuple.

        Raises AssertionError if `other` is a tuple and `device` is None, and
        RuntimeError if `other` is neither a parameter nor a tuple.
        """
        self.process_group = process_group or gpc.get_group(ParallelMode.DATA)
        self.world_size = dist.get_world_size(self.process_group)
        self.local_rank = dist.get_rank(self.process_group)
        # Always start unsharded; shard() below flips the flag when requested.
        self.is_sharded = False

        self._init_payload(other, device)
        if is_sharded:
            self.shard()

        # Not used inside this class; kept so external readers of the attribute keep working.
        self._payload_numel = None

    def _init_payload(self,
                      other: Union[torch.nn.Parameter, Tuple[int, ...]],
                      device: Optional[torch.device]) -> None:
        r"""
        Create the full (unsharded) payload and record its original shape, numel and dtype.
        """
        # Hijack the data payload of param
        if isinstance(other, torch.nn.Parameter):
            # With no target device the payload stays where the parameter lives.
            self._param_payload = other.data if device is None else other.data.to(device)
            self._origin_shape = other.shape
            self._origin_numel = other.numel()
        elif isinstance(other, tuple):
            self._origin_shape = other
            # numpy.prod yields a numpy scalar (a float for the empty shape); store a
            # plain int because the value is later used as a length in torch.narrow.
            self._origin_numel = int(numpy.prod(other))

            # TODO(jiaruifang) can be optimized. Directly allocate payload as the sharded shape.
            assert device is not None, "You have to assign a device to initialize a ShardParam from a shape tuple"
            self._param_payload = torch.empty(self._origin_shape, device=device)
        else:
            raise RuntimeError(f"Initialize ShardParam failed. The 2nd parameter is wrong type {type(other)}")

        # Remembered so that `origin_dtype` has something to return.
        self._origin_dtype = self._param_payload.dtype

    def payload(self, target_device: torch.device):
        r"""
        get the payload and move it to target device
        """
        return self._param_payload.to(target_device)

    def shard(self):
        r"""
        Distribute the payload of param to all processes.
        Does nothing if the payload is already sharded.
        """
        if self.is_sharded:
            return
        # get_shard returns a pair; only the first element (the local shard) is kept.
        self._param_payload, _ = get_shard(self._param_payload, self.local_rank, self.world_size)
        self.is_sharded = True

    def gather(self):
        r"""
        Collect the payload of param from different processes to process of local rank.
        The payload has to be moved to cuda memory before communication.
        Does nothing if the payload is not sharded.
        """
        if not self.is_sharded:
            return

        local_shard = self._param_payload.cuda()
        payload_numel = local_shard.numel()
        # Receive buffers must match the local shard in dtype and device, otherwise
        # all_gather is fed mismatching tensors for non-default-dtype payloads.
        buffer_list = [
            local_shard if rank == self.local_rank else torch.zeros(
                payload_numel, dtype=local_shard.dtype, device=local_shard.device)
            for rank in range(self.world_size)
        ]

        dist.all_gather(buffer_list, buffer_list[self.local_rank], group=self.process_group, async_op=False)
        # Keep only the first `_origin_numel` elements of the concatenation (drops whatever
        # trailing elements the shards carry beyond the original size — presumably padding
        # added by get_shard; confirm there) and restore the original shape.
        self._param_payload = torch.narrow(torch.cat(buffer_list), 0, 0, self._origin_numel).view(self._origin_shape)
        self.is_sharded = False

    @property
    def origin_dtype(self):
        r"""
        dtype of the payload at construction time.
        """
        return self._origin_dtype
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`import torch`
[zero] add sharded grad and refactor grad hooks for ShardedModel (#287) 2022-03-02 10:28:29 +00:00			`import torch.distributed as dist`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`from colossalai.context.parallel_mode import ParallelMode`
			`from colossalai.core import global_context as gpc`
[zero] add sharded grad and refactor grad hooks for ShardedModel (#287) 2022-03-02 10:28:29 +00:00			`from colossalai.zero.sharded_model._zero3_utils import get_shard`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`from typing import Union, Tuple, Optional`
			`import numpy`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`class ShardedParam(object):`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`r"""`
			`A wrapper to torch.nn.Parameter. Shard a param`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`on memory space of different processes.`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`"""`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`def __init__(self,`
			`other: Union[torch.nn.Parameter, Tuple[int, ...]],`
			`process_group: Optional[dist.ProcessGroup] = None,`
			`is_sharded: bool = False,`
			`device: Optional[torch.device] = None) -> None:`
			`r"""`
			`other: either an existing torch parameter or a tuple, indicate allocate a new param with the tuple as shape.`
			`process_group: the process group storing the shared data.`
			`is_sharded: is shared the param during __init__.`
			`device: the device to place param data payload on`
			`"""`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`self.process_group = process_group or gpc.get_group(ParallelMode.DATA)`
			`self.world_size = dist.get_world_size(self.process_group)`
			`self.local_rank = dist.get_rank(self.process_group)`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`self.is_sharded = False`

Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`# Hijack the data payload of param`
			`if isinstance(other, torch.nn.Parameter):`
			`self._param_payload = other.data.to(device)`
			`self._origin_shape = other.shape`
			`self._origin_numel = other.numel()`
			`if is_sharded:`
			`self.shard()`
			`elif isinstance(other, tuple):`
			`self._origin_shape = other`
			`self._origin_numel = numpy.prod(other)`

			`# TODO(jiaruifang) can be optimized. Directly allocate payload as the sharded shape.`
			`assert device is not None, "You have to assign a device to initialize a ShardParam from a shape tuple"`
			`self._param_payload = torch.empty(self._origin_shape, device=device)`
			`if is_sharded:`
			`self.shard()`
			`else:`
			`raise RuntimeError(f"Initialize ShardParam failed. The 2nd parameter is wrong type {type(other)}")`

			`self._payload_numel = None`

fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`def payload(self, target_device: torch.device):`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`r"""`
			`get the payload and move it to target device`
			`"""`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return self._param_payload.to(target_device)`

			`def shard(self):`
			`r"""`
			`Distributed the payload of param to all processes.`
			`"""`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`if self.is_sharded:`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return`
			`self._param_payload, _ = get_shard(self._param_payload, self.local_rank, self.world_size)`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`self.is_sharded = True`

Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`def gather(self):`
			`r"""`
			`Collect the payload of param from different processes to process of local rank.`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 2022-03-03 04:42:57 +00:00			`The payload has to be moved to cuda memory before communication.`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`"""`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`if not self.is_sharded:`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return`
fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`buffer_list = []`
			`payload_numel = self._param_payload.numel()`
			`for i in range(self.world_size):`
			`if i == self.local_rank:`
			`buffer_list.append(self._param_payload.cuda())`
			`else:`
			`buffer_list.append(torch.zeros(payload_numel).cuda())`

fixed typo in ShardParam (#294) 2022-03-02 09:26:23 +00:00			`torch.distributed.all_gather(buffer_list,`
			`buffer_list[self.local_rank],`
			`group=self.process_group,`
			`async_op=False)`
			`self._param_payload = torch.narrow(torch.cat(buffer_list), 0, 0, self._origin_numel).view(self._origin_shape)`
			`self.is_sharded = False`
[zero] add sharded grad and refactor grad hooks for ShardedModel (#287) 2022-03-02 10:28:29 +00:00
			`@property`
			`def origin_dtype(self):`
			`return self._origin_dtype`