mirror of https://github.com/InternLM/InternLM
commit ad0cddce66
@@ -7,6 +7,7 @@ from .parallel_context import (
 from .process_group_initializer import (
     Initializer_Data,
     Initializer_Model,
+    Initializer_Nettest,
     Initializer_Pipeline,
     Initializer_Tensor,
     Initializer_Zero1,
@@ -34,6 +35,7 @@ __all__ = [
     "Initializer_Pipeline",
     "Initializer_Data",
     "Initializer_Zero1",
+    "Initializer_Nettest",
     "ProcessGroupInitializer",
     "Initializer_Model",
     "seed",
@@ -143,6 +143,7 @@ class ParallelContext(metaclass=SingletonMeta):
         self.pipeline_parallel_size = 1
         self.tensor_parallel_size = 1
         self.zero1_parallel_size = -1
+        self.nettest_parallel_size = 1
         self.num_processes_on_current_node = -1
         self.virtual_pipeline_parallel_size = None
         self.virtual_pipeline_parallel_rank = None
@@ -442,6 +443,9 @@ class ParallelContext(metaclass=SingletonMeta):
         # instead, it should be calculated based on other parallel config
         self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)

+        # the recommended nettest_parallel_size is 32 GPUs
+        self.nettest_parallel_size = 32
+
         if self.zero1_parallel_size <= 0:
             self.zero1_parallel_size = self.data_parallel_size

@@ -454,6 +458,7 @@ class ParallelContext(metaclass=SingletonMeta):
             self.pipeline_parallel_size,
             self.tensor_parallel_size,
             self.zero1_parallel_size,
+            self.nettest_parallel_size,
         ]

         # run initialization of different process groups
@@ -462,6 +467,7 @@ class ParallelContext(metaclass=SingletonMeta):
         initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
         initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
         initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
+        initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
         if self.pipeline_parallel_size > 1:
             initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
         for initializer in initializers:
@@ -3,6 +3,7 @@

 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context

+import math
 from abc import ABC, abstractmethod
 from enum import Enum

@@ -31,6 +32,9 @@ class ParallelMode(Enum):
     # zero1 parallel
     ZERO1 = "zero1"

+    # runtime network test
+    NETTEST = "nettest"
+

 class ProcessGroupInitializer(ABC):
     """An object, knowing the parallelism configuration, that initializes parallel groups.
@@ -52,6 +56,7 @@ class ProcessGroupInitializer(ABC):
         pipeline_parallel_size: int,
         tensor_parallel_size: int,
         zero1_parallel_size: int,
+        nettest_parallel_size: int,
     ):
         self.rank = rank
         self.world_size = world_size
@@ -59,6 +64,7 @@ class ProcessGroupInitializer(ABC):
         self.pipeline_parallel_size = pipeline_parallel_size
         self.tensor_parallel_size = tensor_parallel_size
         self.zero1_parallel_size = zero1_parallel_size
+        self.nettest_parallel_size = nettest_parallel_size
         super().__init__()

     @abstractmethod
@@ -332,3 +338,52 @@ class Initializer_Zero1(ProcessGroupInitializer):
             ranks_in_group = ranks

         return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
+
+
+class Initializer_Nettest(ProcessGroupInitializer):
+    """A ProcessGroupInitializer for network test, especially for NCCL.
+
+    Args:
+        rank (int): The rank of current process.
+        world_size (int): Size of whole communication world.
+        nettest_parallel_size (int): Size of a network test group.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
+
+    def init_dist_group(self, use_cpu: bool = False):
+        """Initialize network test groups, and assign local_ranks and groups to each gpu.
+
+        Returns:
+            Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
+                A network test parallelism's information tuple.
+        """
+        local_rank = None
+        ranks_in_group = None
+        process_group = None
+        cpu_group = None
+        group_world_size = None
+        mode = ParallelMode.NETTEST
+
+        for i in range(self.num_nettest_group):
+            ranks = []
+            for j in range(self.nettest_parallel_size):
+                rank = i * self.nettest_parallel_size + j
+                if rank < self.world_size:
+                    ranks.append(rank)
+            group = dist.new_group(ranks)
+            if use_cpu:
+                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+            else:
+                group_cpu = None
+
+            if self.rank in ranks:
+                local_rank = ranks.index(self.rank)
+                group_world_size = len(ranks)
+                process_group = group
+                cpu_group = group_cpu
+                ranks_in_group = ranks
+
+        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
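As a quick standalone sketch (not part of the commit), the grouping logic of Initializer_Nettest can be reproduced without torch.distributed: with the recommended nettest_parallel_size of 32, ranks are split into ceil(world_size / 32) consecutive blocks, and the last block simply holds the remainder. The world_size of 72 below is only an illustration.

import math

def nettest_groups(world_size, nettest_parallel_size=32):
    """Reproduce the rank blocks Initializer_Nettest builds, without any process groups."""
    num_groups = math.ceil(world_size / nettest_parallel_size)
    groups = []
    for i in range(num_groups):
        start = i * nettest_parallel_size
        # clamp to world_size so the last group holds the remainder
        groups.append(list(range(start, min(start + nettest_parallel_size, world_size))))
    return groups

# e.g. 72 GPUs -> three groups: ranks 0-31, 32-63, 64-71
print([len(g) for g in nettest_groups(72)])  # [32, 32, 8]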
@@ -497,6 +497,7 @@ class HybridZeroOptimizer(BaseOptimizer):
             grads = [self.padding_grad]
             params = [self.padding_tensor]

+        norm = 0
         if self._clip_grad_norm > 0:
             # this norm is before scaling, it will be very large
             norm = compute_norm(
@@ -542,16 +543,16 @@ class HybridZeroOptimizer(BaseOptimizer):
         self._param_store.clear_grads_of_previous_reduced_params()

         # compute norm for gradients in the last bucket
-        total_norms = []
+        total_norms = {}
         for group_id in range(self.num_param_groups):
-            total_norms.append(
-                self._compute_norm_with_stage(
-                    group_id=group_id,
-                    last_bucket=True,
-                    last_stage=True,
-                    previous_norm=groups_norms[group_id],
-                )
-            )
+            group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
+            group_name = f"{group_id}_{group_name}"
+            total_norms[group_name] = self._compute_norm_with_stage(
+                group_id=group_id,
+                last_bucket=True,
+                last_stage=True,
+                previous_norm=groups_norms[group_id],
+            )

         timer("sync_grad").start()
         self._sync_grad()
@@ -569,7 +570,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         # found_inf = self._check_overflow()
         # Because you may encounter inf when computing norm

-        if -1 in norms:
+        if -1 in norms.values():
             found_inf = True

         loss_scale = float(self.loss_scale.item())  # backup
@@ -617,15 +618,17 @@ class HybridZeroOptimizer(BaseOptimizer):

         # unscale and clip grads
         # get the global norm
-        global_norm_groups = []
+        global_norm_groups = {}
         if self._clip_grad_norm > 0:
-            for norm in norms:
-                global_norm_groups.append(norm**0.5)
+            for group_name, norm in norms.items():
+                global_norm_groups[group_name] = norm**0.5

         # the following operations are performed only on the rank to which parameters are assigned.
         if gpc.config.model.dtype is not torch.float32:
-            if len(single_grad_partition_groups) != 0:
-                self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
+            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
+                self._unscale_and_clip_grads(
+                    single_grad_partition_groups, list(global_norm_groups.values()), loss_scale
+                )

         # update the parameters
         timer("step").start()
@@ -652,7 +655,9 @@ class HybridZeroOptimizer(BaseOptimizer):

         # update gradients may not be needed here, because the sync_params function is used in initialization,
         # so synchronization is maintained
-        return True, [global_norm / loss_scale for global_norm in global_norm_groups]
+        for group_name, global_norm in global_norm_groups.items():
+            global_norm_groups[group_name] = global_norm / loss_scale
+        return True, global_norm_groups

     def broadcast_params(self):
         handles = []
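For reference (a sketch, not part of the diff): the optimizer now keys every norm by f"{group_id}_{group_name}", so the second return value of step() is a dict rather than a list. The group names and numbers below are illustrative only.

# illustrative squared norms per param group, as produced by _compute_norm_with_stage
norms = {"0_default": 1.69, "1_fp32": 0.25}
global_norm_groups = {k: v**0.5 for k, v in norms.items()}  # {"0_default": 1.3, "1_fp32": 0.5}
loss_scale = 65536.0
# what step() now returns as its second element
grad_norm_groups = {k: v / loss_scale for k, v in global_norm_groups.items()}
# a -1 entry marks an inf/nan group, which is why callers now check `-1 in grad_norm_groups.values()`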
@@ -389,23 +389,31 @@ def record_current_batch_training_metrics(
         line = ""
         for key, value in infos.items():
             line += f"{key}={value} "
-            writer.add_scalar(key=key, value=value, step=train_state.step_count)
+            if isinstance(value, dict):
+                writer.add_scalars(key=key, value=value, step=train_state.step_count)
+            else:
+                writer.add_scalar(key=key, value=value, step=train_state.step_count)

         if update_panel:
-            logger.info(
-                line,
-                extra={
-                    "step": batch_count,
-                    "lr": lr,
-                    "num_consumed_tokens": train_state.num_consumed_tokens,
-                    "grad_norm": grad_norm,
-                    "loss": loss.item(),
-                    "flops": tflops,
-                    "tgs": tk_per_gpu,
-                    "acc": acc_perplex["acc"],
-                    "perplexity": acc_perplex["perplexity"],
-                    "fwd_bwd_time": fwd_bwd_time,
-                },
-            )
+            # metrics shown with dashboard panels
+            panel_metrics = {
+                "step": batch_count,
+                "lr": lr,
+                "num_consumed_tokens": train_state.num_consumed_tokens,
+                "loss": loss.item(),
+                "flops": tflops,
+                "tgs": tk_per_gpu,
+                "acc": acc_perplex["acc"],
+                "perplexity": acc_perplex["perplexity"],
+                "fwd_bwd_time": fwd_bwd_time,
+            }
+            for norm_key, norm_value in grad_norm.items():
+                panel_metrics[norm_key] = norm_value
+
+            logger.info(
+                "{line}",
+                line=line,
+                extra=panel_metrics,
+            )
         else:
             logger.info(line)
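To make the panel change concrete, here is a small sketch with illustrative values: dict-valued infos are routed to writer.add_scalars, and each per-group grad norm becomes its own key in the extra dict handed to logger.info.

# illustrative values only
grad_norm = {"0_default": 1.3, "1_fp32": 0.5}
panel_metrics = {"step": 10, "lr": 0.0004, "loss": 2.31}
for norm_key, norm_value in grad_norm.items():
    panel_metrics[norm_key] = norm_value
print(panel_metrics)
# {'step': 10, 'lr': 0.0004, 'loss': 2.31, '0_default': 1.3, '1_fp32': 0.5}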
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import math
+import socket
+
+import torch
+import torch.distributed as dist
+from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
+from torch.utils import benchmark
+
+from internlm.utils.logger import get_logger
+
+try:
+    import GPUtil
+    import psutil
+except ImportError:
+    GPUtil, psutil = None, None
+
+from internlm.core.context import ParallelMode
+from internlm.core.context import global_context as gpc
+from internlm.utils.common import get_current_device
+
+logger = get_logger(__file__)
+
+
+def benchmark_forward(
+    test_fn,
+    *inputs,
+    repeats=100,
+    amp=True,
+    amp_dtype=torch.float16,
+    **kwinputs,
+):
+    """Use PyTorch Benchmark on the forward pass of an arbitrary function."""
+
+    def amp_wrapper(*inputs, **kwinputs):
+        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
+            test_fn(*inputs, **kwinputs)
+
+    bench_timer = benchmark.Timer(
+        stmt="test_fn_amp(*inputs, **kwinputs)",
+        globals={"test_fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
+        num_threads=torch.get_num_threads(),
+    )
+    used_time = bench_timer.timeit(repeats)
+    return used_time.mean
+
+
+def flops(batch, seqlen, headdim, nheads, time_f):
+    """Compute the flops value of a GPU with a given flash-attention function"""
+    flop = 4 * batch * seqlen**2 * nheads * headdim
+    return (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
+
+
+def get_gpu_temperature():
+    """Get current GPU temperature."""
+    try:
+        gpu_id = torch.cuda.current_device()
+    except AssertionError:
+        gpu_id = -1
+
+    if GPUtil is not None and gpu_id >= 0:
+        gpus = GPUtil.getGPUs()
+        gpu_temperature = gpus[gpu_id].temperature
+    else:
+        gpu_temperature = -1
+
+    return gpu_temperature
+
+
+def get_cpu_temperature():
+    """Get current CPU temperature."""
+    if psutil is not None:
+        cpu_temperature = psutil.sensors_temperatures()["coretemp"][0].current
+    else:
+        cpu_temperature = -1
+
+    return cpu_temperature
+
+
+def bench_net():
+    """Benchmark nccl performance for slow node detection."""
+    if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
+        return
+
+    if gpc.is_rank_for_log():
+        logger.info("benchmarking network speed ...")
+
+    repeats = 100
+    input_data = torch.randn(
+        8 * 1024 * 1024,
+        device=get_current_device(),
+        dtype=torch.bfloat16,
+    )
+
+    def allreduce_fn(inputs):
+        dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
+
+    bench_timer = benchmark.Timer(
+        stmt="test_fn_amp(inputs)",
+        globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
+        num_threads=torch.get_num_threads(),
+    )
+    allreduce_time = bench_timer.timeit(repeats).mean
+    allreduce_time = allreduce_time * 10**3
+    allreduce_time_this = allreduce_time
+    allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
+    dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
+    allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
+    allreduce_time_avg = float(allreduce_time_avg.item())
+
+    if allreduce_time_this >= allreduce_time_avg * 1.05:
+        logger.warning(
+            f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
+            f"Hostname {socket.gethostname()}, "
+            f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
+            f"CPU temp {get_cpu_temperature()}, GPU temp {get_gpu_temperature()}"
+        )
+
+
+def bench_gpu(use_flash_attn=True):
+    """Benchmark single GPU performance for slow node detection."""
+    if gpc.is_rank_for_log():
+        logger.info("benchmarking gpu speed ...")
+
+    headdim = 64
+    dim = 2048
+    batch_size, seqlen = 2, 1024
+    nheads = dim // headdim
+
+    inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
+    inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
+
+    qkv = torch.randn(
+        batch_size,
+        seqlen,
+        3,
+        dim // headdim,
+        headdim,
+        device=get_current_device(),
+        dtype=torch.float16,
+        requires_grad=True,
+    )
+    time_f = benchmark_forward(inner_attn, qkv)
+    speed = flops(batch_size, seqlen, headdim, nheads, time_f)
+    speed_this = speed
+    speed = torch.Tensor([speed]).to(device=get_current_device())
+    dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
+    speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
+    speed_avg = float(speed_avg.item())
+
+    if speed_this <= speed_avg * 0.95:
+        logger.warning(
+            f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
+            f"Hostname {socket.gethostname()}, "
+            f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
+            f"CPU temp {get_cpu_temperature()}, GPU temp {get_gpu_temperature()}"
+        )
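Two back-of-the-envelope numbers (my arithmetic, not part of the commit) make the benchmarks above easier to read: the attention benchmark issues 4 * batch * seqlen^2 * nheads * headdim FLOPs per forward, and the network test all-reduces an 8M-element bfloat16 buffer, i.e. 16 MiB per call. A rank is flagged when it is roughly 5% slower than the global average (the 1.05 and 0.95 factors).

# FLOPs of one bench_gpu forward pass with the hard-coded shapes
batch, seqlen, headdim, dim = 2, 1024, 64, 2048
nheads = dim // headdim                          # 32
flop = 4 * batch * seqlen**2 * nheads * headdim  # 17_179_869_184, about 17.2 GFLOP
print(flop / 1e-3 / 10**12)                      # a 1 ms forward would report ~17.2 TFLOPS

# Payload of one bench_net all-reduce (bfloat16 = 2 bytes per element)
payload_bytes = 8 * 1024 * 1024 * 2
print(payload_bytes / 2**20)                     # 16.0 MiB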
@@ -134,6 +134,14 @@ class Writer:
         except Exception:
             traceback.print_exc()

+    def add_scalars(self, key, value, step):
+        try:
+            assert isinstance(value, dict)
+            if self.enable_tb and self.tb_writer is not None:
+                self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
+        except Exception:
+            traceback.print_exc()
+
     def add_text(self, key, value, step):
         try:
             if self.enable_tb and self.tb_writer is not None:
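A hedged usage sketch of the new method (the group names below are illustrative, and writer, train_state, and loss are assumed to be in scope as in record_current_batch_training_metrics): the per-group grad-norm dict is passed straight through, and TensorBoard then renders one curve per key under a shared main tag.

# assuming `writer` is an initialized Writer with TensorBoard enabled
writer.add_scalars(
    key="grad_norm",
    value={"0_default": 1.3, "1_fp32": 0.5},  # illustrative per-group norms
    step=train_state.step_count,
)
# scalar metrics keep using the existing single-value path
writer.add_scalar(key="loss", value=loss.item(), step=train_state.step_count)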
train.py (8 changed lines)

@@ -6,7 +6,6 @@ import time
 import traceback
 from functools import partial

-import numpy as np
 import torch
 import torch.distributed as dist

@@ -36,6 +35,7 @@ from internlm.utils.common import (
     parse_args,
 )
 from internlm.utils.evaluation import evaluate_on_val_dls
+from internlm.utils.gputest import bench_gpu, bench_net
 from internlm.utils.logger import get_logger, initialize_uniscale_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.model_checkpoint import CheckpointManager
@@ -197,6 +197,8 @@ def main(args):
     for batch_count in range(train_state.batch_count, total_steps):
         if batch_count % 50 == 0:
             torch.cuda.empty_cache()
+            bench_gpu()
+            bench_net()

         start_time = time.time()
         timer("one-batch").start()
@@ -236,7 +238,7 @@ def main(args):
             train_state.step_count += 1
         else:
            train_state.inf_nan_skip_batches += 1  # record the number of batches whose parameter update was skipped
-            if -1 in grad_norm_groups and gpc.is_rank_for_log():  # -1 encodes a specific failure case
+            if -1 in grad_norm_groups.values() and gpc.is_rank_for_log():  # -1 encodes a specific failure case
                logger.warning(f"Warning: skip parameter update at step {batch_count}.")
                send_alert_message(
                    address=gpc.config.alert_address,
@@ -257,7 +259,7 @@ def main(args):
            trainer=trainer,
            start_time=start_time,
            loss=loss,
-           grad_norm=np.array(grad_norm_groups),
+           grad_norm=grad_norm_groups,
            metric=metric,
            update_panel=uniscale_logger is not None,
        )