from enum import Enum
from typing import Optional

import torch.distributed as dist

from colossalai.tensor.distspec import DistPlacementPattern, _DistSpec


class ComputePattern(Enum):
    """Tensor-parallel compute patterns: 1D, 2D, 2.5D and 3D tensor parallelism."""
    TP1D = 0
    TP2D = 1
    TP2P5D = 2
    TP3D = 3


class ParallelAction(object):

    def __init__(self, compute_pattern: ComputePattern, gather_out: bool = True) -> None:
        assert isinstance(compute_pattern, ComputePattern)
        self.compute_pattern = compute_pattern
        self.gather_out = gather_out

    def __repr__(self):
        return f'compute pattern: {self.compute_pattern}, gather out: {self.gather_out}'
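
# Usage sketch (illustrative; not part of the original file): a ParallelAction
# tags a parameter with the tensor-parallel compute pattern of its operator and
# whether the operator's output should be gathered afterwards, e.g.
#
#     action = ParallelAction(ComputePattern.TP1D, gather_out=False)
#     assert action.compute_pattern is ComputePattern.TP1D
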
class TensorSpec(object):
    """
    The specification of a ColoTensor. It contains two aspects of information:
    first, how the tensor is distributed across the heterogeneous memory space;
    second, if the tensor is a model parameter, the spec carries the parallel
    computation pattern of its operator (layer).
    We have to consider the hybrid parallel mode.
    """

    # a list of parallel actions (illustrative pseudo-code; the signature below
    # differs from ParallelAction.__init__ above).
    # For example: on 8 GPUs, a hybrid parallel strategy is applied, using ZeRO
    # with DP-degree = 4 and 1D row-wise TP with TP-degree = 2.
    # parallel_action_list = [
    #     ParallelAction(10, ComputePattern.ZeRO, gpc.get_group(ParallelMode.DATA)),
    #     ParallelAction(1, ComputePattern.TP1D_Linear, gpc.get_group(ParallelMode.PARALLEL_1D))
    # ]
    # When the ColoTensor is initialized, we first split the tensor according to
    # the ParallelAction of ZeRO, then split it according to the ParallelAction
    # of TP1D_Linear.
    # During Linear computation:
    # before the Linear op, we gather the tensors according to ZeRO;
    # we perform the Linear op according to the compute pattern of TP1D_Linear;
    # after the Linear op, we split the tensors according to ZeRO.
    # A small usage sketch follows at the end of this module.

    def __init__(self, dist_spec: _DistSpec, parallel_action: Optional[ParallelAction] = None):
        self.parallel_action = parallel_action
        self.dist_spec = dist_spec

    def get_process_group(self):
        return self.dist_spec.process_group

    def get_process_group_size(self):
        return dist.get_world_size(self.dist_spec.process_group)

    def get_placement(self):
        return self.dist_spec.placement

    def is_gathered(self):
        # a tensor counts as gathered if it is replicated, kept in a single
        # partition, or lives in a process group of size 1
        return self.dist_spec.placement == DistPlacementPattern.REPLICATE \
            or (len(self.dist_spec.num_partitions) == 1
                and self.dist_spec.num_partitions[0] == 1) \
            or (self.dist_spec.process_group.size() == 1)

    def is_1D_col(self):
        # sharded along exactly one dim, the last one (column-wise)
        return self.dist_spec.placement == DistPlacementPattern.SHARD \
            and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == -1

    def is_1D_row(self):
        # sharded along exactly one dim, the first one (row-wise)
        return self.dist_spec.placement == DistPlacementPattern.SHARD \
            and len(self.dist_spec.dims) == 1 and self.dist_spec.dims[0] == 0

    def has_compute_pattern(self, compute_pattern: ComputePattern):
        # guard against specs created without a ParallelAction
        return self.parallel_action is not None \
            and self.parallel_action.compute_pattern == compute_pattern

    def __repr__(self):
        return f'parallel action: {self.parallel_action}, dist_spec: {self.dist_spec}'
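
# --- Usage sketch (illustrative; not part of the original file) ---
# A minimal demo of how the predicates above classify a spec. It assumes that
# _DistSpec stores ``placement`` plus keyword meta info such as ``dims`` and
# ``num_partitions`` as attributes (as in this revision of distspec.py), and
# that no initialized process group is needed for these shape-only checks.
if __name__ == '__main__':
    # a tensor sharded along dim 0 into 2 partitions, i.e. 1D row sharding
    row_spec = _DistSpec(DistPlacementPattern.SHARD, None, dims=[0], num_partitions=[2])
    spec = TensorSpec(row_spec, ParallelAction(ComputePattern.TP1D, gather_out=True))
    assert spec.is_1D_row() and not spec.is_1D_col()
    assert spec.has_compute_pattern(ComputePattern.TP1D)
    print(spec)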