mirror of https://github.com/InternLM/InternLM
fix(metric): add metric dtype control (#533)

* fix(metric): add metric dtype control
* fix demo config to avoid implicity
* fix default behavior

parent 6c0ff4820f
commit cc5b15349d
@@ -145,18 +145,18 @@ model = dict(
     moe_use_residual=False,
     moe_gate_k=2,
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=1,
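The comment block above spells out how the zero1, pipeline, and tensor settings interact. As a rough, illustrative sketch only (an 8-GPU single-node layout; the pipeline line is not part of this hunk), a complete parallel section could look like:

parallel = dict(
    zero1=dict(size=8, fsdp=False),  # shard optimizer states within one 8-GPU node
    tensor=1,  # no tensor parallelism
    pipeline=dict(size=1, interleaved_overlap=True),  # single stage; overlap flag as described above
)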
@@ -176,4 +176,8 @@ monitor = dict(
     ),
 )

 model_type = "INTERNLM_MoE"
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
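Per the new comments, metrics follow the model's dtype unless the config opts in to fp32 explicitly; enabling it is just a matter of uncommenting the last added line, for example:

# compute accuracy/perplexity metrics in fp32 regardless of the model's dtype
metric_dtype = "fp32"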
@@ -146,18 +146,18 @@ model = dict(
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
 )
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-    For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
+
+# zero1 parallel:
+#     1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+#         so parameters will be divided within the range of dp.
+#     2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+#     3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+#     For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+# pipeline parallel (dict):
+#     1. size: int, the size of pipeline parallel.
+#     2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+# tensor parallel: tensor parallel size, usually the number of GPUs per node.
+
 parallel = dict(
     zero1=dict(size=8, fsdp=False),
     tensor=1,
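Note that this config sets the zero1 size to 8 where the MoE config above uses -1. A small standalone sketch of the sizing rules described in the comments (the helper name and the assertion are illustrative, not InternLM code):

def resolve_zero1_size(zero1_size: int, dp_world_size: int) -> int:
    # zero1 <= 0: the zero process group spans the whole data-parallel group
    if zero1_size <= 0:
        return dp_world_size
    # zero1 == 1: ZeRO effectively off, every dp rank keeps full model/optimizer states
    # 1 < zero1 <= dp world size: shard within a subset of dp ranks (e.g. one node)
    assert zero1_size <= dp_world_size, "zero1 size cannot exceed the dp world size"
    return zero1_size

print(resolve_zero1_size(-1, 32))  # 32
print(resolve_zero1_size(8, 32))   # 8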
@@ -177,3 +177,7 @@ monitor = dict(
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
 )
+
+# metric_dtype can be "fp32" or other string
+# only when set to "fp32" will use fp32 to calc in metrics
+# metric_dtype = "fp32"
@@ -26,7 +26,11 @@ class AccPerplex:
         self.device = device
         self.right = torch.Tensor([0]).to(device=device)
         self.total = torch.Tensor([0]).to(device=device)
-        self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=torch.float)
+        self.metric_dtype = torch.float if gpc.config.get("metric_dtype", None) == "fp32" else None
+        if self.metric_dtype is not None:
+            self.total_log_probs = torch.Tensor([0]).to(device=device, dtype=self.metric_dtype)
+        else:
+            self.total_log_probs = torch.Tensor([0]).to(device=device)
         self.tp_pg = tp_pg
         self.dp_pg = dp_pg
         self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)
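The accumulator dtype is no longer hard-coded to fp32; it follows the optional metric_dtype config key, and the default (key absent) skips the explicit fp32 handling. A standalone sketch of that selection logic, using a plain dict in place of gpc.config (helper name is made up, illustrative only):

import torch

def pick_metric_dtype(cfg: dict):
    # only the exact string "fp32" forces float32 metric math; anything else means "no cast"
    return torch.float if cfg.get("metric_dtype", None) == "fp32" else None

print(pick_metric_dtype({"metric_dtype": "fp32"}))  # torch.float32
print(pick_metric_dtype({}))                        # None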
@@ -128,8 +132,9 @@ class AccPerplex:
         # All reduce is needed to get the chunks from other GPUs.
         torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)

-        predicted_logits = predicted_logits.to(dtype=torch.float)
-        shift_logits = shift_logits.to(dtype=torch.float)
+        if self.metric_dtype is not None:
+            predicted_logits = predicted_logits.to(dtype=self.metric_dtype)
+            shift_logits = shift_logits.to(dtype=self.metric_dtype)

         pred_exp_logits = torch.exp(predicted_logits)
         # Sum of exponential of logits along vocab dimension across all GPUs.
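One reason to opt into fp32 here is visible in the very next line: the logits are exponentiated, and exp overflows quickly in half precision, which can turn the perplexity sum into inf. A tiny illustration (not part of the commit):

import torch

x = torch.tensor([12.0], dtype=torch.float16)
print(torch.exp(x))                        # tensor([inf], dtype=torch.float16): exp(12) exceeds the fp16 max (~65504)
print(torch.exp(x.to(dtype=torch.float)))  # tensor([162754.7969])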