mirror of https://github.com/InternLM/InternLM
fix(fsdp_optimizer.py): wait grad async
parent c5a7e76ada
commit 056996f8b3
@@ -100,7 +100,8 @@ class FSDPadaptOptimizer(BaseOptimizer):
             params = self._fp16_param_groups[group_idx]
             for param in params:
                 if param.requires_grad:
-                    reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle = reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle.wait()
 
         # compute norm
         found_inf = False
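
Context for the hunk above: reduce_tensor launches the gradient reduction asynchronously and returns a communication handle. The old line discarded that handle, so the norm computation further down could read param.grad before the reduction had finished; the fix keeps the handle and waits on it. A minimal sketch of the same pattern written against plain torch.distributed (the helper name and arguments are illustrative, not InternLM's API):

import torch
import torch.distributed as dist

def reduce_grads_and_wait(params, group=None):
    # Launch each reduction with async_op=True, keep the returned Work
    # handles, and wait on them before the gradients are consumed.
    handles = []
    for param in params:
        if param.requires_grad and param.grad is not None:
            handles.append(
                dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=group, async_op=True)
            )
    for handle in handles:
        handle.wait()  # without this, later code may see partially reduced grads

The commit waits immediately after each launch; collecting the handles and waiting once at the end, as sketched here, is a variant that keeps more of the communication overlapped with the loop.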
@@ -309,7 +309,7 @@ def load_model_checkpoint(folder, model):
     """
     There should be weights with names similar to the following under the folder.
     - folder
-        - model_tp{tp_rank}_pp{pp_rank}.pt\
+        - model_tp{tp_rank}_pp{pp_rank}.pt
 
     If fsdp is activated, the saved weight is named:
     - folder
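
The docstring above documents two naming schemes: model_tp{tp_rank}_pp{pp_rank}.pt for a plain checkpoint and, once FSDP is activated, model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt (used in the next hunk). A small hypothetical helper, consistent with the segements[-1][2:]-style slicing below, showing how such names can be built and parsed:

import os

def ckpt_name(tp_rank, pp_rank, zo_rank=None):
    # The zo index is only present when FSDP sharding is active.
    if zo_rank is None:
        return f"model_tp{tp_rank}_pp{pp_rank}.pt"
    return f"model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt"

def parse_ranks(fname):
    # 'model_tp0_pp1_zo2.pt' -> {'tp': 0, 'pp': 1, 'zo': 2}
    stem = os.path.splitext(fname)[0]
    return {seg[:2]: int(seg[2:]) for seg in stem.split("_")[1:]}

assert parse_ranks(ckpt_name(0, 1)) == {"tp": 0, "pp": 1}
assert parse_ranks(ckpt_name(3, 2, 7)) == {"tp": 3, "pp": 2, "zo": 7}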
@@ -345,15 +345,16 @@ def load_model_checkpoint(folder, model):
             max_pp = max(max_pp, int(segements[-1][2:]))
             max_tp = max(max_tp, int(segements[-2][2:]))
 
-    assert (
-        zo_size == max_zo + 1
-    ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"
     assert (
         pp_size == max_pp + 1
     ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines"
     assert (
         tp_size == max_tp + 1
     ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism"
+    if gpc.config.parallel.use_fsdp:
+        assert (
+            zo_size == max_zo + 1
+        ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"
 
     if gpc.config.parallel.use_fsdp:
         should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt"
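
The last hunk moves the FSDP shard-count assertion under gpc.config.parallel.use_fsdp, so a run without FSDP is no longer checked against a zo count that only exists in FSDP-sharded checkpoints. A condensed sketch of the resulting validation (a hypothetical helper, not the repository's function):

def check_parallel_layout(pp_size, tp_size, zo_size, max_pp, max_tp, max_zo, use_fsdp):
    # Pipeline and tensor parallel sizes must always match the saved shards.
    assert pp_size == max_pp + 1, f"saved for {max_pp + 1} pipelines, current has {pp_size}"
    assert tp_size == max_tp + 1, f"saved for {max_tp + 1} tensor ranks, current has {tp_size}"
    # The zo (FSDP) shard index only appears in FSDP checkpoints, so it is
    # validated only when FSDP is enabled for the current run.
    if use_fsdp:
        assert zo_size == max_zo + 1, f"saved for {max_zo + 1} FSDP shards, current has {zo_size}"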