rm debug log

pull/412/head
JiaoPL 2023-10-13 12:25:46 +08:00
parent f2358b9432
commit 646f1b45fa
1 changed file with 2 additions and 8 deletions

@@ -317,6 +317,8 @@ def compute_norm(
     if gpc.is_initialized(ParallelMode.MODEL):
         dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
+    # Because we use zero1, gradients are sharded within the zero group, so this reduction is also needed.
+    # TODO: Check zero group to be a subset of dp group.
     dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=gpc.get_group(zero_mode))
     for idx, layer_name in enumerate(total_layer_norms.keys()):
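
Reviewer's note on the hunk above: a minimal, runnable sketch of the two-stage reduction it performs, first over the model-parallel group and then over the zero group. The helper name reduce_layer_norms, the single-process gloo setup, and the sample tensor are illustrative assumptions, not part of this patch.

import os
import torch
import torch.distributed as dist

def reduce_layer_norms(total_layer_norms_values, model_group, zero_group):
    # Sum per-layer norm contributions across model-parallel ranks first,
    # then across the zero group, where zero1 shards the gradients.
    if model_group is not None:
        dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=model_group)
    dist.all_reduce(total_layer_norms_values, op=dist.ReduceOp.SUM, group=zero_group)
    return total_layer_norms_values

if __name__ == "__main__":
    # Single-process demo on the gloo backend; in training these would be
    # gpc.get_group(ParallelMode.MODEL) and gpc.get_group(zero_mode).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    norms = torch.tensor([0.5, 1.25])  # squared per-layer norms on this rank
    print(reduce_layer_norms(norms, None, dist.group.WORLD))
    dist.destroy_process_group()

Since summation is associative, reducing over the two orthogonal groups in sequence is equivalent to a single reduction over their product group, which is what the TODO's subset check would guarantee.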
@@ -521,9 +523,6 @@ class ParamBcastSyncHandler:
         for _chunk in model:
             if isinstance(_chunk, NaiveAMPModel):
                 _chunk = _chunk.model
-            # if gpc.is_rank_for_log():
-            #     logger.info(_chunk)
-            # [ name for name , _ in model.model.named_children()]
             for name, children in _chunk.named_children():
                 # should be the transformer block definaton in modeling_xxx.py
                 if isinstance(children, nn.ModuleList):
@@ -533,8 +532,6 @@ class ParamBcastSyncHandler:
                     self._block_to_param[block] = list(block.parameters())
                     for parameter in self._block_to_param[block]:
                         layer_name = f"{block.__class__.__name__}.{idx}"
-                        # if gpc.is_rank_for_log():
-                        #     logger.info(layer_name)
                         global_layer_norms[layer_name] = 0.0
                         parameter.__setattr__("layer_name", layer_name)
                 else:
@@ -543,9 +540,6 @@ class ParamBcastSyncHandler:
                     self._block_to_param[children] = list(children.parameters())
                     for parameter in self._block_to_param[children]:
                         layer_name = f"{children.__class__.__name__}"
-                        # if gpc.is_rank_for_log():
-                        #     logger.info(layer_name)
-                        # global_layer_norms[layer_name] = 0.0
                         parameter.__setattr__("layer_name", name)
         alloc_num = 0
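
For reference, a self-contained sketch of the traversal the last three hunks touch: walk the model's top-level children, treat each nn.ModuleList as the stack of transformer blocks, and tag every parameter with a layer_name attribute. ToyModel and tag_layer_names are hypothetical stand-ins, not the ParamBcastSyncHandler implementation itself.

import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(10, 4)
        self.blocks = nn.ModuleList(nn.Linear(4, 4) for _ in range(2))

def tag_layer_names(model):
    # Mirror of the loop above: ModuleList children are treated as the
    # per-index transformer blocks; any other child is tagged by its name.
    block_to_param = {}
    for name, children in model.named_children():
        if isinstance(children, nn.ModuleList):
            for idx, block in enumerate(children):
                block_to_param[block] = list(block.parameters())
                layer_name = f"{block.__class__.__name__}.{idx}"
                for parameter in block_to_param[block]:
                    setattr(parameter, "layer_name", layer_name)
        else:
            block_to_param[children] = list(children.parameters())
            for parameter in block_to_param[children]:
                setattr(parameter, "layer_name", name)
    return block_to_param

if __name__ == "__main__":
    model = ToyModel()
    tag_layer_names(model)
    print({n: p.layer_name for n, p in model.named_parameters()})
    # -> {'embed.weight': 'embed', 'blocks.0.weight': 'Linear.0', ...}

Tagging parameters this way lets later passes (norm logging, broadcast scheduling) recover the owning block from a bare parameter without re-walking the module tree.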