mirror of https://github.com/InternLM/InternLM
parent
607f691e16
commit
794a484666
|
@ -4,6 +4,7 @@
|
||||||
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
|
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
from collections import deque
|
||||||
from typing import Iterable, Optional
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
from internlm.core.engine import Engine
|
from internlm.core.engine import Engine
|
||||||
|
@ -58,6 +59,24 @@ class TrainState:
|
||||||
if batch_sampler:
|
if batch_sampler:
|
||||||
self.init_batch_sampler(batch_sampler)
|
self.init_batch_sampler(batch_sampler)
|
||||||
|
|
||||||
|
# tgs statistic
|
||||||
|
self.tgs_statistic = {
|
||||||
|
"sum_step": 0,
|
||||||
|
"sum_tg": 0,
|
||||||
|
"sum_time": 0,
|
||||||
|
"sum_last_tg_10": 0,
|
||||||
|
"sum_last_time_10": 0,
|
||||||
|
"sum_last_tg_50": 0,
|
||||||
|
"sum_last_time_50": 0,
|
||||||
|
"SMA_tg_50": 0,
|
||||||
|
"SMA_time_50": 0,
|
||||||
|
"SMA_tg_50_list": deque(),
|
||||||
|
"SMA_time_50_list": deque(),
|
||||||
|
"sum_tgs": 0,
|
||||||
|
"last_tgs_10": 0,
|
||||||
|
"last_tgs_50": 0,
|
||||||
|
}
|
||||||
|
|
||||||
def init_batch_sampler(self, batch_sampler):
|
def init_batch_sampler(self, batch_sampler):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
|
|
@ -372,9 +372,52 @@ def record_current_batch_training_metrics(
|
||||||
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
|
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
|
||||||
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
time_cost = time.time() - start_time
|
||||||
tk_per_gpu = 0
|
|
||||||
tk_per_gpu = round(
|
tk_per_gpu = round(
|
||||||
|
num_tokens_in_batch * gpc.get_world_size(ParallelMode.DATA) / gpc.get_world_size(ParallelMode.GLOBAL),
|
||||||
|
4,
|
||||||
|
)
|
||||||
|
tgs_statistic = train_state.tgs_statistic
|
||||||
|
tgs_statistic["sum_step"] += 1
|
||||||
|
tgs_statistic["sum_tg"] += tk_per_gpu
|
||||||
|
tgs_statistic["sum_time"] += time_cost
|
||||||
|
tgs_statistic["sum_last_tg_10"] += tk_per_gpu
|
||||||
|
tgs_statistic["sum_last_time_10"] += time_cost
|
||||||
|
tgs_statistic["sum_last_tg_50"] += tk_per_gpu
|
||||||
|
tgs_statistic["sum_last_time_50"] += time_cost
|
||||||
|
tgs_statistic["SMA_tg_50"] += tk_per_gpu
|
||||||
|
tgs_statistic["SMA_time_50"] += time_cost
|
||||||
|
tgs_statistic["SMA_tg_50_list"].append(tk_per_gpu)
|
||||||
|
tgs_statistic["SMA_time_50_list"].append(time_cost)
|
||||||
|
if tgs_statistic["sum_step"] > 50:
|
||||||
|
tgs_statistic["SMA_tg_50"] -= tgs_statistic["SMA_tg_50_list"][0]
|
||||||
|
tgs_statistic["SMA_time_50"] -= tgs_statistic["SMA_time_50_list"][0]
|
||||||
|
tgs_statistic["SMA_tg_50_list"].popleft()
|
||||||
|
tgs_statistic["SMA_time_50_list"].popleft()
|
||||||
|
|
||||||
|
last_tgs_1 = round(tk_per_gpu / time_cost, 2)
|
||||||
|
tgs_statistic["sum_tgs"] += last_tgs_1
|
||||||
|
|
||||||
|
if tgs_statistic["sum_step"] % 10 == 0:
|
||||||
|
tgs_statistic["last_tgs_10"] = round(tgs_statistic["sum_last_tg_10"] / tgs_statistic["sum_last_time_10"], 2)
|
||||||
|
tgs_statistic["sum_last_tg_10"] = 0
|
||||||
|
tgs_statistic["sum_last_time_10"] = 0
|
||||||
|
|
||||||
|
if tgs_statistic["sum_step"] % 50 == 0:
|
||||||
|
tgs_statistic["last_tgs_50"] = round(tgs_statistic["sum_last_tg_50"] / tgs_statistic["sum_last_time_50"], 2)
|
||||||
|
tgs_statistic["sum_last_tg_50"] = 0
|
||||||
|
tgs_statistic["sum_last_time_50"] = 0
|
||||||
|
|
||||||
|
last_tgs_10 = tgs_statistic["last_tgs_10"]
|
||||||
|
last_tgs_50 = tgs_statistic["last_tgs_50"]
|
||||||
|
|
||||||
|
tgs_all = round(tgs_statistic["sum_tg"] / tgs_statistic["sum_time"], 2)
|
||||||
|
tgs_avg = round(tgs_statistic["sum_tgs"] / tgs_statistic["sum_step"], 2)
|
||||||
|
tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2)
|
||||||
|
|
||||||
|
tflops = get_tflops_func((time.time() - start_time))
|
||||||
|
|
||||||
|
tgs_origin = round(
|
||||||
num_tokens_in_batch
|
num_tokens_in_batch
|
||||||
* gpc.get_world_size(ParallelMode.DATA)
|
* gpc.get_world_size(ParallelMode.DATA)
|
||||||
/ gpc.get_world_size(ParallelMode.GLOBAL)
|
/ gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
@ -382,13 +425,17 @@ def record_current_batch_training_metrics(
|
||||||
2,
|
2,
|
||||||
)
|
)
|
||||||
|
|
||||||
tflops = get_tflops_func((time.time() - start_time))
|
|
||||||
|
|
||||||
infos = {
|
infos = {
|
||||||
"tflops": tflops,
|
"tflops": tflops,
|
||||||
"step": batch_count,
|
"step": batch_count,
|
||||||
"loss": loss.item(),
|
"loss": loss.item(),
|
||||||
"tgs (tokens/gpu/second)": tk_per_gpu,
|
"tgs (tokens/gpu/second)": tgs_origin,
|
||||||
|
"tgs/last_tgs_1": last_tgs_1,
|
||||||
|
"tgs/tgs_all": tgs_all,
|
||||||
|
"tgs/tgs_avg": tgs_avg,
|
||||||
|
"tgs/tgs_SMA": tgs_SMA,
|
||||||
|
"tgs/last_tgs_10": last_tgs_10,
|
||||||
|
"tgs/last_tgs_50": last_tgs_50,
|
||||||
"lr": lr,
|
"lr": lr,
|
||||||
"loss_scale": scaler,
|
"loss_scale": scaler,
|
||||||
"grad_norm": grad_norm,
|
"grad_norm": grad_norm,
|
||||||
|
@ -428,7 +475,7 @@ def record_current_batch_training_metrics(
|
||||||
"num_consumed_tokens": train_state.num_consumed_tokens,
|
"num_consumed_tokens": train_state.num_consumed_tokens,
|
||||||
"loss": loss.item(),
|
"loss": loss.item(),
|
||||||
"flops": tflops,
|
"flops": tflops,
|
||||||
"tgs": tk_per_gpu,
|
"tgs": last_tgs_1,
|
||||||
"acc": acc_perplex["acc"],
|
"acc": acc_perplex["acc"],
|
||||||
"perplexity": acc_perplex["perplexity"],
|
"perplexity": acc_perplex["perplexity"],
|
||||||
"fwd_bwd_time": fwd_bwd_time,
|
"fwd_bwd_time": fwd_bwd_time,
|
||||||
|
|
Loading…
Reference in New Issue