InternLM/internlm/model/metrics.py

from typing import List

import torch
from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss
from torch_scatter import scatter

from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.utils.parallel import is_no_pp_or_last_stage


class AccPerplex:
    """
    AccPerplex module for calculating model's accuracy and perplexity metrics.

    Statistics are accumulated over successive calls to ``update`` (or ``__call__``) and converted
    into a result dictionary by ``get_metric``. The last dimension of the logits is treated as this
    rank's slice of the vocabulary: partial results are combined over ``tp_pg`` inside ``update`` and
    summed over ``dp_pg`` inside ``get_metric``.

    Args:
        device: The GPU device.
        tp_pg: The tensor parallel process group.
        dp_pg: The data parallel process group.
        tokenizer: For calculating BPB.
        dataset_types (List[str]): Various data types that will be used in the current training process,
            such as ['en', 'cn', 'code']. The order of the List should be consistent with the type_id specified
            in the dataset. Changed parameters need to be used in conjunction with set_current_type_ids().
    """

    def __init__(self, device, tp_pg, dp_pg, tokenizer=None, dataset_types: List[str] = None):
        self.device = device
        # Running sums: correctly predicted tokens, counted tokens, negative log-likelihood.
        self.right = torch.zeros(1, device=device)
        self.total = torch.zeros(1, device=device)
        self.total_log_probs = torch.zeros(1, device=device)
        self.tp_pg = tp_pg
        self.dp_pg = dp_pg
        self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)
        self.tokenizer = tokenizer
        # Running count of UTF-8 bytes of the decoded labels (only filled when a tokenizer is given).
        self.total_bytes = torch.zeros(1, device=device)
        # Index of the next micro batch inside the tensor passed to set_current_type_ids().
        self.batch_shift = 0
        self.type_ids = None
        if dataset_types is not None:
            # The per-type attributes only exist when dataset types are configured; the rest of
            # the class detects the feature with hasattr(self, "total_type_count").
            self.dataset_types = dataset_types
            self.total_type_count = len(dataset_types)
            self.ds_right = torch.zeros(self.total_type_count, dtype=torch.long, device=device)
            self.ds_tokens = torch.zeros(self.total_type_count, dtype=torch.long, device=device)

        self.loss_with_type_id = LossWithTypeId(device, dp_pg, dataset_types)

    def set_current_type_ids(self, type_ids: torch.Tensor):
        """Store the type ids of the upcoming batch and restart the micro-batch cursor."""
        self.batch_shift = 0
        # Use the configured device instead of whatever the current CUDA device happens to be.
        self.type_ids = type_ids.to(device=self.device)

    def __call__(self, logits, labels):
        """Shortcut for ``update`` using the type ids stored by ``set_current_type_ids``."""
        return self.update(logits, labels, type_ids=self.type_ids)

    def update(self, logits, labels, type_ids=None):
        """
        Accumulate accuracy / perplexity statistics for one micro batch.

        Args:
            logits: Tensor (or list/tuple whose first item is the tensor) whose last dimension is the
                local vocabulary partition.
            labels: Target token ids; positions equal to -100 are excluded from the token count and
                from the log-probability sum.
            type_ids: Optional type ids for the whole batch; the slice for the current micro batch is
                selected with ``self.batch_shift``. When None, per-type statistics are skipped.
        """
        dist = torch.distributed

        if gpc.config.model.use_flash_attn:
            micro_bsz = labels.size(0)
        else:
            micro_bsz = 1
        if type_ids is not None:
            # Pick the rows that belong to the current micro batch, then advance the cursor.
            start = self.batch_shift * micro_bsz
            type_ids = type_ids[start : start + micro_bsz].view(-1)
            self.batch_shift += 1
        self.loss_with_type_id.update(logits, labels, type_ids)

        with torch.no_grad():
            if isinstance(logits, (list, tuple)):
                logits = logits[0]

            logits = logits.detach().clone()
            labels = labels.detach().clone()

            if self.tokenizer:  # need to calculate bits per bytes
                sequences = self.tokenizer.decode_ids(labels.tolist())
                self.total_bytes += sum(len(seq.encode("utf-8")) for seq in sequences)

            shift_logits = logits.view(-1, logits.size(-1))
            shift_labels = labels.view(-1)
            # There is a shift according to the current rank, because the logits are split
            pred_shift = self.tp_local_rank * logits.shape[-1]

            local_max = torch.max(shift_logits, dim=-1)[0]
            # Reduce a copy so the local maximum stays available for the comparison below.
            logits_max = local_max.clone()
            dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=self.tp_pg)
            # Determine whether the maximum value of the current local tensor is the global maximum value
            logits_global = logits_max == local_max

            corrects = torch.logical_and(
                (shift_labels == (shift_logits.argmax(dim=-1) + pred_shift)), logits_global
            ).long()
            mask = shift_labels.ne(-100).long()
            if hasattr(self, "total_type_count") and type_ids is not None:
                ds_acc = scatter(corrects, type_ids, dim=0, reduce="sum")
                token_num_type = scatter(mask, type_ids, dim=0, reduce="sum")
                # scatter() only returns as many slots as the largest type id seen; pad the rest with zeros.
                missing = self.total_type_count - len(ds_acc)
                if missing > 0:
                    ds_acc = torch.cat([ds_acc, ds_acc.new_zeros(missing)])
                    token_num_type = torch.cat(
                        [token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]
                    )
                self.ds_tokens += token_num_type
                dist.all_reduce(ds_acc, op=dist.ReduceOp.SUM, group=self.tp_pg)
                self.ds_right += ds_acc.view(-1)

            acc = corrects.sum()
            dist.all_reduce(acc, op=dist.ReduceOp.SUM, group=self.tp_pg)
            self.right += acc  # Masked_fill is not needed here because -100 is not available anyway
            self.total += mask.sum()

            # Subtract the maximum value.
            shift_logits = shift_logits.sub(logits_max.unsqueeze(dim=-1))

            # Get the partition's vocab indices
            partition_vocab_size = shift_logits.size()[-1]
            vocab_start_index = partition_vocab_size * self.tp_local_rank
            vocab_end_index = vocab_start_index + partition_vocab_size

            # Create a mask of valid vocab ids (1 means it needs to be masked).
            target_mask = (shift_labels < vocab_start_index) | (shift_labels >= vocab_end_index)
            masked_target = shift_labels - vocab_start_index
            masked_target[target_mask] = 0

            # Get predicted-logits = logits[target]; rows whose target lives on another rank contribute 0.
            row_index = torch.arange(start=0, end=shift_logits.size()[0], device=shift_logits.device)
            predicted_logits = shift_logits[row_index, masked_target]  # advanced indexing returns a new tensor
            predicted_logits[target_mask] = 0.0
            # All reduce is needed to get the chunks from other GPUs.
            dist.all_reduce(predicted_logits, op=dist.ReduceOp.SUM, group=self.tp_pg)

            # Sum of exponential of logits along vocab dimension across all GPUs.
            sum_exp_logits = torch.exp(shift_logits).sum(dim=-1)
            dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM, group=self.tp_pg)

            # -log(exp(x) / sum_exp) written as log(sum_exp) - x: exp(x) can underflow to 0 for very
            # unlikely targets (notably with half-precision logits), which would turn the sum into inf.
            token_nll = sum_exp_logits.log() - predicted_logits
            # Sum in float32 so that low-precision logits do not lose accuracy in the reduction.
            self.total_log_probs += token_nll.float().masked_fill(shift_labels.eq(-100), 0).sum()

    def get_metric(self, reset=True):
        """
        Build the metric dictionary from the accumulated statistics.

        Args:
            reset (bool): When True, clear the accumulators (including those of the nested
                ``LossWithTypeId``) after reading them. When False, all accumulators are left untouched.

        Returns:
            dict: ``acc`` and ``perplexity``; ``BPB`` when a tokenizer was given; ``acc/<type>`` and
            ``tokens/<type>`` per dataset type when dataset types were given; plus the entries returned
            by ``LossWithTypeId.get_metric``.
        """
        dist = torch.distributed
        has_types = hasattr(self, "total_type_count")

        # all_reduce works in place, so reduce copies: with reset=False the local accumulators must
        # keep holding only this rank's contribution, otherwise it is counted again on the next call.
        right = self.right.clone()
        total = self.total.clone()
        total_log_probs = self.total_log_probs.clone()
        total_bytes = self.total_bytes.clone()
        if has_types:
            ds_right = self.ds_right.clone()
            ds_token_counts = self.ds_tokens.clone()

        if self.dp_pg is not None and is_no_pp_or_last_stage():
            dist.all_reduce(right, op=dist.ReduceOp.SUM, group=self.dp_pg)
            dist.all_reduce(total, op=dist.ReduceOp.SUM, group=self.dp_pg)
            dist.all_reduce(total_log_probs, op=dist.ReduceOp.SUM, group=self.dp_pg)
            if has_types:
                dist.all_reduce(ds_right, op=dist.ReduceOp.SUM, group=self.dp_pg)
                dist.all_reduce(ds_token_counts, op=dist.ReduceOp.SUM, group=self.dp_pg)
            if self.tokenizer:
                dist.all_reduce(total_bytes, op=dist.ReduceOp.SUM, group=self.dp_pg)

        res = {
            "acc": round((right / total).item(), 4),
            "perplexity": round(torch.exp(total_log_probs / total).item(), 4),
        }
        if self.tokenizer is not None:
            # NOTE(review): total_log_probs is accumulated with the natural logarithm, so this ratio
            # is in nats per byte; divide by ln(2) if true bits-per-byte is required.
            res["BPB"] = round((total_log_probs / total_bytes).item(), 4) if self.tokenizer else 0

        if has_types:
            for i in range(self.total_type_count):
                # The small epsilon keeps types without any token at 0 instead of NaN.
                res[f"acc/{self.dataset_types[i]}"] = round(
                    (ds_right[i].float() / (ds_token_counts[i].float() + 1e-5)).item(), 4
                )
            for i in range(self.total_type_count):
                res[f"tokens/{self.dataset_types[i]}"] = ds_token_counts[i].item()

        if reset:
            self.right.fill_(0)
            self.total.fill_(0)
            self.total_log_probs.fill_(0)
            self.total_bytes.fill_(0)
            if has_types:
                self.ds_right.fill_(0)
                self.ds_tokens.fill_(0)

        # Forward the reset flag so that reset=False really leaves every accumulator intact.
        res.update(self.loss_with_type_id.get_metric(reset=reset))

        return res


class LossWithTypeId:
    """
    Accumulates the token-level cross-entropy loss, overall and (optionally) per dataset type.

    Note that the value reported here may differ from the loss shown in the main training log,
    because the loss here is reduced over the data parallel group.

    Args:
        device: Device on which the accumulators are allocated.
        dp_pg: The data parallel process group used by ``get_metric``; no reduction is done when None.
        dataset_types (List[str]): Optional names of the dataset types; entry ``i`` names type id ``i``.
    """

    def __init__(self, device, dp_pg, dataset_types: List[str] = None) -> None:
        self.device = device
        self.dp_pg = dp_pg

        # Running sum of the per-token loss and the number of tokens it was computed on.
        self.loss = torch.zeros(1, device=device)
        self.token_num = torch.zeros(1, device=device)

        if dataset_types is not None:
            # The per-type attributes only exist when dataset types are configured; the rest of
            # the class detects the feature with hasattr(self, "total_type_count").
            self.dataset_types = dataset_types
            self.total_type_count = len(dataset_types)
            self.ds_loss = torch.zeros(self.total_type_count, dtype=torch.float, device=device)
            self.ds_token_num = torch.zeros(self.total_type_count, dtype=torch.float, device=device)

        self.loss_fn = FlashCrossEntropyLoss(
            reduction="none", inplace_backward=True, process_group=gpc.get_group(ParallelMode.TENSOR)
        )

    def update(self, logits, labels, type_ids=None):
        """
        Accumulate the loss of one micro batch.

        Args:
            logits: Tensor (or list/tuple whose first item is the tensor); flattened to 2-D over the
                last dimension before the loss is computed.
            labels: Target token ids; positions equal to -100 are excluded.
            type_ids: Optional per-token type ids with as many elements as ``labels``. When None,
                per-type statistics are skipped.
        """
        with torch.no_grad():
            if isinstance(logits, (list, tuple)):
                logits = logits[0]
            logits = logits.contiguous().view(-1, logits.size(-1))
            labels = labels.contiguous().view(-1)
            loss_list = self.loss_fn(logits, labels)

            # Keep only the positions that carry a real label.
            cond = labels != -100
            real_loss_list = loss_list[cond]
            self.loss += real_loss_list.sum()
            self.token_num += real_loss_list.numel()

            if hasattr(self, "total_type_count") and type_ids is not None:
                type_ids = type_ids.contiguous().view(-1).to(self.device)
                real_type_ids = type_ids[cond]

                loss_list_type = scatter(real_loss_list, real_type_ids, dim=0, reduce="sum")
                token_num_type = scatter(torch.ones_like(real_loss_list), real_type_ids, dim=0, reduce="sum")

                # scatter() only returns as many slots as the largest type id seen; pad the rest with zeros.
                if len(loss_list_type) < self.total_type_count:
                    loss_list_type = torch.cat(
                        [loss_list_type, loss_list_type.new_zeros(self.total_type_count - len(loss_list_type))]
                    )
                    token_num_type = torch.cat(
                        [token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]
                    )
                self.ds_loss += loss_list_type
                self.ds_token_num += token_num_type

    def get_metric(self, reset=True):
        """
        Return the mean loss per token, overall and per dataset type.

        Args:
            reset (bool): When True, clear the accumulators after reading them. When False, they
                are left untouched.

        Returns:
            dict: ``loss_from_metric`` plus one ``loss/<type>`` entry per configured dataset type.
            A ratio with zero tokens evaluates to NaN.
        """
        dist = torch.distributed
        has_types = hasattr(self, "total_type_count")

        # all_reduce works in place, so reduce copies: with reset=False the local accumulators must
        # keep holding only this rank's contribution, otherwise it is counted again on the next call.
        loss_sum = self.loss.clone()
        token_num = self.token_num.clone()
        if has_types:
            ds_loss_sum = self.ds_loss.clone()
            ds_token_num = self.ds_token_num.clone()

        if self.dp_pg is not None and is_no_pp_or_last_stage():
            dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM, group=self.dp_pg)
            dist.all_reduce(token_num, op=dist.ReduceOp.SUM, group=self.dp_pg)
            if has_types:
                dist.all_reduce(ds_loss_sum, op=dist.ReduceOp.SUM, group=self.dp_pg)
                dist.all_reduce(ds_token_num, op=dist.ReduceOp.SUM, group=self.dp_pg)

        res = {
            "loss_from_metric": round((loss_sum / token_num).item(), 4),
        }
        if has_types:
            for i, type_name in enumerate(self.dataset_types):
                res[f"loss/{type_name}"] = round((ds_loss_sum[i] / ds_token_num[i]).item(), 4)

        if reset:
            self.loss.fill_(0.0)
            self.token_num.fill_(0.0)
            if has_types:
                self.ds_loss.fill_(0.0)
                self.ds_token_num.fill_(0.0)

        return res
feat(model/metrics.py): support calculating accuracy and perplexity m… (#91) * feat(model/metrics.py): support calculating accuracy and perplexity metrics * fix(model/metrics.py): fix import error * feat(train.py): minor update --------- Co-authored-by: 黄婷 <huangting3@CN0014010744M.local> Co-authored-by: huangting.p <huangting@sensetime.com> 2023-07-26 08:22:10 +00:00			`from typing import List`

			`import torch`
			`from flash_attn.losses.cross_entropy import CrossEntropyLoss as FlashCrossEntropyLoss`
			`from torch_scatter import scatter`

			`from internlm.core.context import ParallelMode`
			`from internlm.core.context import global_context as gpc`
			`from internlm.utils.parallel import is_no_pp_or_last_stage`


			`class AccPerplex:`
			`"""`
			`AccPerplex module for calculating model's accuracy and perplexity metrics.`

			`Args:`
			`device: The GPU device.`
			`tp_pg: The tensor parallel process group.`
			`dp_pg: The data parallel process group.`
			`tokenizer: For calculating BPB.`
			`dataset_types (List[str]): Various data types that will be used in the current training process,`
			`such as ['en', 'cn', 'code']. The order of the List should be consistent with the type_id specified`
			`in the dataset. Changed parameters need to be used in conjunction with set_current_type_ids().`
			`"""`

			`def __init__(self, device, tp_pg, dp_pg, tokenizer=None, dataset_types: List[str] = None):`
			`self.device = device`
			`self.right = torch.Tensor([0]).to(device=device)`
			`self.total = torch.Tensor([0]).to(device=device)`
			`self.total_log_probs = torch.Tensor([0]).to(device=device)`
			`self.tp_pg = tp_pg`
			`self.dp_pg = dp_pg`
			`self.tp_local_rank = torch.distributed.get_rank(self.tp_pg)`
			`self.tokenizer = tokenizer`
			`self.total_bytes = torch.Tensor([0]).to(device=device).view(1)`
			`self.batch_shift = 0`
			`self.type_ids = None`
			`if dataset_types is not None:`
			`self.dataset_types = dataset_types`
			`self.total_type_count = len(dataset_types)`
			`self.ds_right = torch.zeros(self.total_type_count, dtype=torch.long, device=device)`
			`self.ds_tokens = torch.zeros(self.total_type_count, dtype=torch.long, device=device)`

			`self.loss_with_type_id = LossWithTypeId(device, dp_pg, dataset_types)`

			`def set_current_type_ids(self, type_ids: torch.Tensor):`
			`self.batch_shift = 0`
			`self.type_ids = type_ids.cuda()`

			`def __call__(self, logits, labels):`
			`return self.update(logits, labels, type_ids=self.type_ids)`

			`def update(self, logits, labels, type_ids=None):`
feat(): support not-flash-attn for pp and no-pp (#145) support not flash attention for no-pp * support pipeline * modify the config * refactor the code * refactor the code * remove some unnecessary code 2023-07-28 08:13:04 +00:00			`if gpc.config.model.use_flash_attn:`
			`micro_bsz = labels.size(0)`
			`else:`
			`micro_bsz = 1`
feat(model/metrics.py): support calculating accuracy and perplexity m… (#91) * feat(model/metrics.py): support calculating accuracy and perplexity metrics * fix(model/metrics.py): fix import error * feat(train.py): minor update --------- Co-authored-by: 黄婷 <huangting3@CN0014010744M.local> Co-authored-by: huangting.p <huangting@sensetime.com> 2023-07-26 08:22:10 +00:00			`if type_ids is not None:`
			`type_ids = type_ids[self.batch_shift * micro_bsz : (self.batch_shift + 1) * micro_bsz].view(-1)`
			`self.batch_shift += 1`
			`self.loss_with_type_id.update(logits, labels, type_ids)`

			`with torch.no_grad():`
			`if isinstance(logits, (list, tuple)):`
			`logits = logits[0]`

			`logits = logits.detach().clone()`
			`labels = labels.detach().clone()`

			`if self.tokenizer: # need to calculate bits per bytes`
			`sequences = self.tokenizer.decode_ids(labels.tolist())`
			`self.total_bytes += sum(map(lambda x: len(x.encode("utf-8")), sequences))`

			`shift_logits = logits.view(-1, logits.size(-1))`
			`shift_labels = labels.view(-1)`
			`# There is a shift according to the current rank, because the logits are split`
			`pred_shift = self.tp_local_rank * logits.shape[-1]`

			`logits_max = torch.max(shift_logits, dim=-1)[0]`
			`torch.distributed.all_reduce(logits_max, op=torch.distributed.ReduceOp.MAX, group=self.tp_pg)`
			`# Determine whether the maximum value of the current local tensor is the global maximum value`
			`logits_global = logits_max == torch.max(shift_logits, dim=-1)[0]`

			`corrects = torch.logical_and(`
			`(shift_labels == (shift_logits.argmax(dim=-1) + pred_shift)), logits_global`
			`).long()`
			`mask = shift_labels.ne(-100).long()`
			`if hasattr(self, "total_type_count"):`
			`ds_acc = scatter(corrects, type_ids, dim=0, reduce="sum")`
			`token_num_type = scatter(mask, type_ids, dim=0, reduce="sum")`
			`if len(ds_acc) < self.total_type_count:`
			`ds_acc = torch.cat([ds_acc, ds_acc.new_zeros(self.total_type_count - len(ds_acc))])`
			`token_num_type = torch.cat(`
			`[token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]`
			`)`
			`self.ds_tokens += token_num_type`
			`sync_tensor = ds_acc`
			`torch.distributed.all_reduce(sync_tensor, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)`
			`self.ds_right += sync_tensor.view(-1)`

			`acc = corrects.sum()`
			`torch.distributed.all_reduce(acc, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)`
			`self.right += acc # Masked_fill is not needed here because -100 is not available anyway`
			`self.total += mask.sum()`

			`# Subtract the maximum value.`
			`shift_logits = shift_logits.sub(logits_max.unsqueeze(dim=-1))`

			`# Get the partition's vocab indecies`
			`partition_vocab_size = shift_logits.size()[-1]`
			`vocab_start_index = partition_vocab_size * self.tp_local_rank`
			`vocab_end_index = vocab_start_index + partition_vocab_size`

			`# Create a mask of valid vocab ids (1 means it needs to be masked).`
			`target_mask = (shift_labels < vocab_start_index) \| (shift_labels >= vocab_end_index)`
			`masked_target = shift_labels - vocab_start_index`
			`masked_target[target_mask] = 0`

			`# Get predicted-logits = logits[target].`
			`# For Simplicity, we convert logits to a 2-D tensor with size`
			`# [, partition-vocab-size] and target to a 1-D tensor of size [].`
			`logits_2d = shift_logits.view(-1, partition_vocab_size)`
			`masked_target_1d = masked_target.view(-1)`
			`arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)`
			`predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]`
			`predicted_logits_1d = predicted_logits_1d.clone().contiguous()`
			`predicted_logits = predicted_logits_1d.view_as(shift_labels) # bsz x max_len`
			`predicted_logits[target_mask] = 0.0`
			`# All reduce is needed to get the chunks from other GPUs.`
			`torch.distributed.all_reduce(predicted_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)`

			`pred_exp_logits = torch.exp(predicted_logits)`
			`# Sum of exponential of logits along vocab dimension across all GPUs.`
			`sum_exp_logits = torch.exp(shift_logits).sum(dim=-1)`
			`torch.distributed.all_reduce(sum_exp_logits, op=torch.distributed.ReduceOp.SUM, group=self.tp_pg)`

			`total_log_probs = -(pred_exp_logits / sum_exp_logits).log().masked_fill(shift_labels.eq(-100), 0).sum()`
			`self.total_log_probs += total_log_probs`

			`def get_metric(self, reset=True):`
			`if is_no_pp_or_last_stage() and self.dp_pg is not None:`
			`torch.distributed.all_reduce(self.right, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`torch.distributed.all_reduce(self.total, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`torch.distributed.all_reduce(self.total_log_probs, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`if hasattr(self, "total_type_count"):`
			`torch.distributed.all_reduce(self.ds_right, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`torch.distributed.all_reduce(self.ds_tokens, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`if self.tokenizer:`
			`torch.distributed.all_reduce(self.total_bytes, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`

			`acc = round((self.right / self.total).item(), 4)`
			`perplexity = round(torch.exp(self.total_log_probs / self.total).item(), 4)`
			`bits_per_bytes = round((self.total_log_probs / self.total_bytes).item(), 4) if self.tokenizer else 0`

			`if hasattr(self, "total_type_count"):`
			`ds_acc = {}`
			`ds_tokens = {}`
			`for i in range(self.total_type_count):`
			`ds_acc[f"acc/{self.dataset_types[i]}"] = round(`
			`(self.ds_right[i].float() / (self.ds_tokens[i].float() + 1e-5)).item(), 4`
			`)`
			`ds_tokens[f"tokens/{self.dataset_types[i]}"] = self.ds_tokens[i].item()`
			`if reset:`
			`self.right.fill_(0)`
			`self.total.fill_(0)`
			`self.total_log_probs.fill_(0)`
			`self.total_bytes.fill_(0)`
			`if hasattr(self, "total_type_count"):`
			`self.ds_right.fill_(0)`
			`self.ds_tokens.fill_(0)`
			`if self.tokenizer is not None:`
			`res = {"acc": acc, "perplexity": perplexity, "BPB": bits_per_bytes}`
			`else:`
			`res = {"acc": acc, "perplexity": perplexity}`
			`if hasattr(self, "total_type_count"):`
			`res.update(ds_acc)`
			`res.update(ds_tokens)`

			`loss_res = self.loss_with_type_id.get_metric()`
			`res.update(loss_res)`

			`return res`


			`class LossWithTypeId:`
			`"""`
			`Notice the loss value computed here may be not the same with the main info loss,`
			`cause loss here is the reduced result of the data parallel.`
			`"""`

			`def __init__(self, device, dp_pg, dataset_types: List[str] = None) -> None:`
			`self.device = device`
			`self.dp_pg = dp_pg`

			`self.loss = torch.Tensor([0.0]).to(device=device)`
			`self.token_num = torch.Tensor([0.0]).to(device=device)`

			`if dataset_types is not None:`
			`self.dataset_types = dataset_types`
			`self.total_type_count = len(dataset_types)`
			`self.ds_loss = torch.zeros(self.total_type_count, dtype=torch.float, device=device)`
			`self.ds_token_num = torch.zeros(self.total_type_count, dtype=torch.float, device=device)`

			`self.loss_fn = FlashCrossEntropyLoss(`
			`reduction="none", inplace_backward=True, process_group=gpc.get_group(ParallelMode.TENSOR)`
			`)`

			`def update(self, logits, labels, type_ids=None):`
			`with torch.no_grad():`
			`if isinstance(logits, (list, tuple)):`
			`logits = logits[0]`
			`logits = logits.contiguous().view(-1, logits.size(-1))`
			`labels = labels.contiguous().view(-1)`
			`loss_list = self.loss_fn(logits, labels)`

			`cond = labels != -100`
			`real_loss_list = loss_list[cond]`
			`self.loss += real_loss_list.sum()`
			`self.token_num += real_loss_list.numel()`

			`if hasattr(self, "total_type_count"):`
			`type_ids = type_ids.contiguous().view(-1).to(self.device)`
			`real_type_ids = type_ids[cond]`

			`loss_list_type = scatter(real_loss_list, real_type_ids, dim=0, reduce="sum")`
			`token_num_type = scatter(torch.ones_like(real_loss_list), real_type_ids, dim=0, reduce="sum")`

			`if len(loss_list_type) < self.total_type_count:`
			`loss_list_type = torch.cat(`
			`[loss_list_type, loss_list_type.new_zeros(self.total_type_count - len(loss_list_type))]`
			`)`
			`token_num_type = torch.cat(`
			`[token_num_type, token_num_type.new_zeros(self.total_type_count - len(token_num_type))]`
			`)`
			`self.ds_loss += loss_list_type`
			`self.ds_token_num += token_num_type`

			`def get_metric(self, reset=True):`
			`if is_no_pp_or_last_stage() and self.dp_pg is not None:`
			`torch.distributed.all_reduce(self.loss, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`torch.distributed.all_reduce(self.token_num, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`if hasattr(self, "total_type_count"):`
			`torch.distributed.all_reduce(self.ds_loss, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`
			`torch.distributed.all_reduce(self.ds_token_num, op=torch.distributed.ReduceOp.SUM, group=self.dp_pg)`

			`loss = round((self.loss / self.token_num).item(), 4)`
			`res = {`
			`"loss_from_metric": loss,`
			`}`
			`if hasattr(self, "total_type_count"):`
			`ds_loss = {}`
			`for i in range(self.total_type_count):`
			`ds_loss[f"loss/{self.dataset_types[i]}"] = round((self.ds_loss[i] / self.ds_token_num[i]).item(), 4)`
			`res.update(ds_loss)`

			`if reset:`
			`self.loss.fill_(0.0)`
			`self.token_num.fill_(0.0)`
			`if hasattr(self, "total_type_count"):`
			`self.ds_loss.fill_(0.0)`
			`self.ds_token_num.fill_(0.0)`

			`return res`