mirror of https://github.com/hpcaitech/ColossalAI
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
from typing import List
|
|
|
|
from torch import Tensor
|
|
from torch._utils import _flatten_dense_tensors
|
|
|
|
from .base_store import BaseStore
|
|
|
|
|
|
class GradientStore(BaseStore):
    """Store the gradient slices of parameters, organised per parameter group.

    Gradients are kept in a two-level mapping::

        {
            group_id: {
                param_id: [grad_rank0, grad_rank1, ...]
            }
        }

    When ``partition_grad`` is set (zero2), each per-parameter list is
    expected to hold only the local slice: ``param_id: [grad_local_rank]``.
    """

    def __init__(self, *args, partition_grad: bool = False):
        """Create an empty gradient store.

        Args:
            *args: Positional arguments forwarded unchanged to ``BaseStore``.
            partition_grad (bool): Whether gradients are partitioned (zero2).
                Keyword-only. Defaults to False.
        """
        super().__init__(*args)

        # Maps group_id -> {param_id -> list of gradient slices}.
        self._grads_of_params = dict()

        # Index of the slice this rank works on inside each per-parameter list.
        # For zero2 the list is `param_id: [grad_local_rank]`, so the working
        # slice is always at index 0; otherwise it sits at the local rank.
        # NOTE(review): `_local_rank` is not defined in this class; presumably
        # it is set by BaseStore.__init__ -- confirm.
        self._working_index = 0 if partition_grad else self._local_rank

    def get_partitioned_gradients_by_param_id(self, group_id: int, param_id: int) -> List:
        """Return list of gradient slices of a specific parameter

        Args:
            group_id (int): The index of a parameter group
            param_id (int): The id of a parameter

        Returns:
            List: the list of gradient slices of a parameter. An empty list is
                returned when nothing is stored for the group or the parameter.
        """
        group_grads = self._grads_of_params.get(group_id)
        if group_grads is None:
            return []

        # the param has no grad, for instance, in layer drop
        return group_grads.get(param_id, [])

    def append_gradients_by_param_id(self, grad: Tensor, group_id: int, param_id: int):
        """Append a gradient slice to the parameter's gradient slice list

        The group mapping and the parameter's list are created on first use.

        Args:
            grad (Tensor): The gradient slice to append to list
            group_id (int): The index of a parameter group
            param_id (int): The id of a parameter
        """
        group_grads = self._grads_of_params.setdefault(group_id, dict())
        group_grads.setdefault(param_id, []).append(grad)

    def add_gradients_by_param_id(self, grad: Tensor, grad_idx: int, group_id: int, param_id: int):
        """Add a gradient slice on an existing slice of the parameter's gradient

        Used when no_sync is not activated.

        Args:
            grad (Tensor): The split gradient to accumulate in place (``add_``)
                onto the existing slice
            grad_idx (int): The index of the existing slice
            group_id (int): The index of a parameter group
            param_id (int): The id of a parameter

        Raises:
            KeyError: If nothing is stored yet for ``group_id`` or ``param_id``.
            IndexError: If ``grad_idx`` is outside the parameter's slice list.
        """
        # The target slice must already exist; lookup errors are deliberately
        # left to propagate rather than being silently ignored.
        self._grads_of_params[group_id][param_id][grad_idx].add_(grad)

    def get_working_grads_by_group_id(self, group_id: int) -> List:
        """Return list of working gradient slices in the group

        Args:
            group_id (int): The index of a parameter group

        Returns:
            List: the list of working gradient slices in the group. An empty
                list is returned when no gradient is stored for the group,
                matching ``get_partitioned_gradients_by_param_id``.
        """
        # A group may legitimately hold no gradients (nothing appended yet, or
        # none of its parameters received a gradient); report that as "no
        # working grads" instead of raising KeyError.
        group_grads = self._grads_of_params.get(group_id)
        if not group_grads:
            return []

        return [param_grads[self._working_index] for param_grads in group_grads.values()]

    def reset_grads_by_group_id(self, group_id: int):
        """Drop every stored gradient slice of one parameter group

        The group is left registered with an empty mapping.

        Args:
            group_id (int): The index of a parameter group
        """
        self._grads_of_params[group_id] = dict()

    def reset_all_gradients(self):
        """Drop every stored gradient slice of all parameter groups"""
        self._grads_of_params = dict()
|