mirror of https://github.com/hpcaitech/ColossalAI
[nfc] fix typo colossalai/zero (#3923)
parent a9d1cadc49
commit de0d7df33f
@@ -238,7 +238,7 @@ def initialize(model: nn.Module,
         loaded into gpc.config.

     Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
+        model (:class:`torch.nn.Module` or Callable): Your model instance or a function to build the model.
         optimizer (:class:`torch.optim.optimizer.Optimizer` or :class:`Type[torch.optim.optimizer]`):
             Your optimizer instance.
         criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
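For context, the hunk above edits the docstring of `colossalai.initialize`. Below is a minimal usage sketch; it assumes the legacy launch-then-initialize workflow with an empty config, and the exact signature and return values vary between releases.

    import colossalai
    import torch
    import torch.nn as nn

    # Assumed workflow: launch first so the config is loaded into gpc.config,
    # then hand model/optimizer/criterion to initialize() for wrapping.
    colossalai.launch_from_torch(config={})

    model = nn.Linear(1024, 10)                    # or a callable that builds the model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    # initialize() returns an engine plus (possibly None) dataloaders/scheduler.
    engine, *_ = colossalai.initialize(model=model, optimizer=optimizer, criterion=criterion)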
@@ -7,7 +7,7 @@ def colo_model_optimizer_usage(optim) -> Tuple[int, int]:
     """Trace the optimizer memory usage

     Args:
-        optim (ShardedOptimV2): an instance of ShardedOptimver
+        optim (ShardedOptimV2): an instance of ShardedOptimizer

     Returns:
         Tuple[int, int]: cuda/cpu memory usage in Byte
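The function documented above reports optimizer memory usage as a (cuda, cpu) byte pair. A generic sketch of that kind of accounting follows; it simply walks a vanilla `torch.optim.Optimizer` state dict and is not ColossalAI's sharded-aware implementation.

    from typing import Tuple
    import torch

    def optimizer_state_usage(optim: torch.optim.Optimizer) -> Tuple[int, int]:
        """Approximate (cuda_bytes, cpu_bytes) held by optimizer state tensors."""
        cuda_usage, cpu_usage = 0, 0
        for state in optim.state.values():            # one state dict per parameter
            for value in state.values():
                if not isinstance(value, torch.Tensor):
                    continue                          # skip scalar entries such as `step`
                nbytes = value.numel() * value.element_size()
                if value.is_cuda:
                    cuda_usage += nbytes
                else:
                    cpu_usage += nbytes
        return cuda_usage, cpu_usage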
@@ -46,7 +46,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
     """A context to initialize model.

     1. Convert the model to fp16.
-    2. The paramaters of the module are adapted to type ShardedParameter.
+    2. The parameters of the module are adapted to type ShardedParameter.
     3. Shard the param and grad according to flags.

     Args:
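The three numbered steps above describe what the context manager does to modules constructed inside it. A rough usage sketch follows, assuming the legacy ZeRO API with `target_device`, `shard_strategy` and `shard_param` arguments; import paths and argument names differ across releases.

    import torch
    import torch.nn as nn
    from colossalai.zero.init_ctx import ZeroInitContext          # path may vary by version
    from colossalai.zero.shard_utils import TensorShardStrategy   # path may vary by version

    # Parameters built inside the context come out as fp16 ShardedParameter objects,
    # already sharded according to the flags, so the full fp32 model never needs to
    # fit on a single device.
    with ZeroInitContext(target_device=torch.device('cuda'),
                         shard_strategy=TensorShardStrategy(),
                         shard_param=True):
        model = nn.Sequential(nn.Linear(1024, 1024), nn.GELU(), nn.Linear(1024, 1024))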
@@ -69,7 +69,7 @@ class ShardedModelV2(nn.Module):
             If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
             Note that 'auto' policy can only work well when no other processes use CUDA during your training.
             Defaults to 'cuda'.
-        gradient_predivide_factor (Optional[float], optional): Gradient is divived by this value before reduce-scatter. Defaults to 1.0.
+        gradient_predivide_factor (Optional[float], optional): Gradient is divided by this value before reduce-scatter. Defaults to 1.0.
         reuse_fp16_shard (bool, optional): Whether to reuse fp16 shard for param and grad.
             Enabling this can reduce GPU memory usage, but you have to make sure you disable it when using gradient accumulation.
             In this mode, grad will be fp16. Make sure your optimizer supports mixed precision (fp32 param and fp16 grad).
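To make the `gradient_predivide_factor` line concrete: dividing by a factor before the reduce-scatter and by `world_size / factor` afterwards still averages the gradient, but keeps the intermediate fp16 sums smaller. The sketch below is illustrative, not the `ShardedModelV2` code path, and assumes the flattened gradient length is divisible by the world size.

    import torch
    import torch.distributed as dist

    def reduce_scatter_grad(flat_grad: torch.Tensor, predivide_factor: float = 1.0) -> torch.Tensor:
        world_size = dist.get_world_size()
        if predivide_factor > 1.0:
            flat_grad.div_(predivide_factor)               # pre-divide before communication

        shards = list(flat_grad.chunk(world_size))         # one shard per rank
        out = torch.empty_like(shards[dist.get_rank()])
        dist.reduce_scatter(out, shards, op=dist.ReduceOp.SUM)

        postdivide_factor = world_size / predivide_factor  # finish the averaging afterwards
        if postdivide_factor > 1.0:
            out.div_(postdivide_factor)
        return out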
@@ -205,7 +205,7 @@ class ShardedModelV2(nn.Module):
             exit(0)
         """
         if self._use_memory_tracer:
-            self.logger.error(f'dump memort tracer collected information to a {filename}', ranks=[0])
+            self.logger.error(f'dump memory tracer collected information to a {filename}', ranks=[0])
             if gpc.get_global_rank() == 0:
                 with open(filename, 'w+') as f:
                     f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
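The dump above relies on public `torch.cuda` counters; a minimal standalone version of the same kind of report could look like this.

    import torch

    def report_cuda_memory(device=None) -> str:
        reserved = torch.cuda.memory_reserved(device) / 1e9    # held by the caching allocator
        allocated = torch.cuda.memory_allocated(device) / 1e9  # occupied by live tensors
        return f'cuda reserved {reserved} GB, cuda allocated {allocated} GB'

    if torch.cuda.is_available():
        print(report_cuda_memory())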
@@ -385,7 +385,7 @@ class ShardedModelV2(nn.Module):
                 # make parameters point to gradient

                 assert param.colo_attr.saved_grad.is_null(
-                ), 'Gradien accumulation is not supported when reuse_fp16_shard=True'
+                ), 'Gradient accumulation is not supported when reuse_fp16_shard=True'

                 param.colo_attr.grad_payload_reset(grad.data)
                 # release the memory of param
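A toy illustration of why the assert above forbids gradient accumulation with `reuse_fp16_shard=True`: once the fp16 parameter payload is handed over to the gradient, there is no second buffer left to accumulate into. Plain tensors are used here; `grad_payload_reset` itself is ColossalAI-internal.

    import torch

    param = torch.randn(4, dtype=torch.float16)
    grad = torch.randn(4, dtype=torch.float16)

    # "Reuse the fp16 shard": the parameter payload now points at the gradient data,
    # roughly what grad_payload_reset(grad.data) achieves for a sharded parameter.
    param.data = grad.data
    assert param.data_ptr() == grad.data_ptr()

    # A second backward pass would need to add into the old payload, but that storage
    # has been released, which is why accumulation is unsupported in this mode.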
@@ -261,7 +261,7 @@ def sync_param(flat_tensor, tensor_list):
     share the same memory space. This function will update the tensor list so that
     they point to the same value.

-    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor lsit
+    :param flat_tensor: A flat tensor obtained by calling `torch._utils._unflatten_dense_tensors` on a tensor list
     :param tensor_list: A list of tensors corresponding to the flattened tensor
     :type flat_tensor: torch.Tensor
     :type tensor_list: List[torch.Tensor]
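A self-contained sketch of what `sync_param` is for, using the flatten/unflatten helpers from `torch._utils` (the flat buffer is presumably produced by `_flatten_dense_tensors`; the docstring's reference to `_unflatten_dense_tensors` reads like a leftover): after rebinding, the list tensors and the flat tensor share the same storage.

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    tensor_list = [torch.randn(3), torch.randn(5), torch.randn(2)]
    flat_tensor = _flatten_dense_tensors(tensor_list)        # one contiguous buffer

    # Carve the flat buffer into views shaped like the originals and make the
    # original tensors point at those views, so updates are shared both ways.
    for tensor, shard in zip(tensor_list, _unflatten_dense_tensors(flat_tensor, tensor_list)):
        tensor.data = shard

    flat_tensor.add_(1.0)                                    # visible through tensor_list
    assert torch.equal(tensor_list[0], flat_tensor[:3])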
@@ -207,8 +207,8 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
             for param in self._working_param_groups[group_id]:
                 self._param_store.set_param_reduction_state(param, False)

-        # intialize communication stream for
-        # communication-compuation overlapping
+        # initialize communication stream for
+        # communication-computation overlapping
         if self._overlap_communication:
             self._comm_stream = torch.cuda.Stream()

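The comment fixed above refers to the stream used for overlapping gradient communication with computation. A generic sketch of that pattern (not the optimizer's own reduction logic) follows.

    import torch
    import torch.distributed as dist

    comm_stream = torch.cuda.Stream()

    def reduce_in_background(grad: torch.Tensor) -> None:
        # Let the side stream wait until the default stream has produced `grad`,
        # then launch the all-reduce there so later compute kernels can overlap it.
        comm_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(comm_stream):
            grad.record_stream(comm_stream)     # keep the allocator aware of the cross-stream use
            dist.all_reduce(grad, async_op=True)

    def wait_for_communication() -> None:
        # Before the optimizer consumes reduced gradients, sync back with the side stream.
        torch.cuda.current_stream().wait_stream(comm_stream)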
@@ -269,7 +269,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
         params_per_rank = [[] for _ in range(self._world_size)]
         numel_per_rank = [0 for _ in range(self._world_size)]

-        # partititon the parameters in a greedy fashion
+        # partition the parameters in a greedy fashion
         sorted_params = sorted(param_list, key=lambda x: x.numel(), reverse=True)
         for param in sorted_params:
             # allocate this parameter to the rank with
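The partitioning step above can be read as a standalone greedy bin-packing routine: sort parameters by element count, then assign each one to the currently lightest rank (presumably the rank with the smallest running numel, as the truncated comment suggests). A sketch using the same variable names:

    from typing import List, Tuple
    import torch

    def partition_params_greedily(param_list: List[torch.Tensor],
                                  world_size: int) -> Tuple[List[List[torch.Tensor]], List[int]]:
        params_per_rank = [[] for _ in range(world_size)]
        numel_per_rank = [0 for _ in range(world_size)]

        # largest parameters first, each going to the rank with the fewest elements so far
        for param in sorted(param_list, key=lambda x: x.numel(), reverse=True):
            rank = numel_per_rank.index(min(numel_per_rank))
            params_per_rank[rank].append(param)
            numel_per_rank[rank] += param.numel()
        return params_per_rank, numel_per_rank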
@@ -297,7 +297,7 @@ class LowLevelZeroOptimizer(ColossalaiOptimizer):
             param_group = self._working_param_groups[group_id]
             for param in param_group:
                 if param.requires_grad:
-                    # determines the reduction destionation rank
+                    # determines the reduction destination rank
                     # this is only valid for stage 2
                     # dst_rank = None means using all-reduce
                     # else using reduce
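The fixed comment describes a dispatch rule: with no destination rank the gradient is all-reduced so every rank holds the sum, otherwise (ZeRO stage 2) it is reduced only to the rank that owns that partition. A minimal sketch of that rule:

    from typing import Optional
    import torch
    import torch.distributed as dist

    def reduce_grad(grad: torch.Tensor, dst_rank: Optional[int] = None) -> None:
        if dst_rank is None:
            dist.all_reduce(grad)             # every rank ends up with the reduced gradient
        else:
            dist.reduce(grad, dst=dst_rank)   # only the owning rank needs it in stage 2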