ColossalAI/applications/ColossalChat/coati/dataset/utils.py

import io
import json
from typing import Any, Dict, List

import torch
import torch.distributed as dist
import torch.nn.functional as F
from transformers import PreTrainedTokenizer


def is_rank_0() -> bool:
    return not dist.is_initialized() or dist.get_rank() == 0
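
# Illustrative usage (not part of the original file): restrict side effects such as logging
# to a single process in a distributed run, e.g. `if is_rank_0(): print("logged once")`.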


def _make_r_io_base(f, mode: str):
    # Accept either a path-like object or an already-open stream.
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict
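
# Illustrative usage (hypothetical path): `examples = jload("data/sft_data.json")` returns the
# parsed JSON content (a dict for a JSON object, a list for a JSON array).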


def read_string_by_schema(data: Dict[str, Any], schema: str) -> str:
    """
    Read a field of the dataset by schema.
    Args:
        data: Dict[str, Any]
        schema: cascaded field names separated by '.'. e.g. person.name.first will access data['person']['name']['first']
    """
    keys = schema.split(".")
    result = data
    for key in keys:
        result = result.get(key, None)
        if result is None:
            return ""
    assert isinstance(result, str), f"dataset element is not a string: {result}"
    return result
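
# Illustrative usage (not part of the original file), with a toy nested record:
#   >>> read_string_by_schema({"person": {"name": {"first": "Ada"}}}, "person.name.first")
#   'Ada'
#   >>> read_string_by_schema({"person": {}}, "person.name.first")  # missing key -> empty string
#   ''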


def pad_to_max_len(
    sequence: List[torch.Tensor], max_length: int, padding_value: int, batch_first: bool = True, padding_side="left"
):
    """
    Pad a batch of variable-length 1-D tensors to `max_length`.
    Args:
        sequence: a list of 1-D tensors of shape [seq_len]; the result has shape [batch_size, max_length] if batch_first==True
    """
    if padding_side == "left":
        reversed_sequence = [seq.flip(dims=(0,)) for seq in sequence]
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences=reversed_sequence, batch_first=batch_first, padding_value=padding_value
        )
        to_pad = max_length - padded.size(1)
        padded = F.pad(padded, (0, to_pad), value=padding_value)
        return torch.flip(padded, dims=(1,))
    elif padding_side == "right":
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences=sequence, batch_first=batch_first, padding_value=padding_value
        )
        to_pad = max_length - padded.size(1)
        return F.pad(padded, (0, to_pad), value=padding_value)
    else:
        raise RuntimeError(f"`padding_side` can only be `left` or `right`, but got `{padding_side}`")
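
# Illustrative usage (not part of the original file), with toy tensors:
#   >>> batch = [torch.tensor([1, 2, 3]), torch.tensor([4])]
#   >>> pad_to_max_len(batch, max_length=5, padding_value=0)                         # left padding (default)
#   tensor([[0, 0, 1, 2, 3],
#           [0, 0, 0, 0, 4]])
#   >>> pad_to_max_len(batch, max_length=5, padding_value=0, padding_side="right")
#   tensor([[1, 2, 3, 0, 0],
#           [4, 0, 0, 0, 0]])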


def chuncate_sequence(sequence: List[torch.Tensor], max_length: int, dtype: Any):
    """
    Truncate each sequence in the batch to at most `max_length` tokens.
    Args:
        sequence: a list of 1-D tensors (one per sample), each of shape [seq_len]
    """
    return [
        torch.Tensor(seq[:max_length]).to(dtype) if len(seq) > max_length else torch.Tensor(seq).to(dtype)
        for seq in sequence
    ]
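
# Illustrative usage (not part of the original file):
#   >>> chuncate_sequence([torch.tensor([1, 2, 3, 4]), torch.tensor([5])], max_length=2, dtype=torch.long)
#   [tensor([1, 2]), tensor([5])]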


def find_first_occurrence_subsequence(seq: torch.Tensor, subseq: torch.Tensor, start_index: int = 0) -> int:
    """Return the index of the first occurrence of `subseq` in `seq` at or after `start_index`, or -1 if absent; a None `subseq` matches at 0."""
    if subseq is None:
        return 0
    for i in range(start_index, len(seq) - len(subseq) + 1):
        if torch.all(seq[i : i + len(subseq)] == subseq):
            return i
    return -1
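
# Illustrative usage (not part of the original file):
#   >>> find_first_occurrence_subsequence(torch.tensor([5, 6, 7, 8]), torch.tensor([7, 8]))
#   2
#   >>> find_first_occurrence_subsequence(torch.tensor([5, 6, 7, 8]), torch.tensor([9]))
#   -1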


def tokenize_and_concatenate(tokenizer: PreTrainedTokenizer, text: List[str], require_loss: List[bool]):
    """
    Tokenizes a list of texts using the provided tokenizer and concatenates the tokenized outputs.
    Args:
        tokenizer (PreTrainedTokenizer): The tokenizer to use for tokenization.
        text (List[str]): The list of texts to tokenize.
        require_loss (List[bool]): A list of boolean values indicating whether each text requires loss calculation.
    Returns:
        Tuple[List[int], List[int], List[int]]: A tuple containing the concatenated tokenized input ids,
        the start positions of loss spans, and the end positions of loss spans.
    """
    input_ids = []
    loss_starts = []
    loss_ends = []
    for s, r in zip(text, require_loss):
        tokenized = tokenizer(s, add_special_tokens=False)["input_ids"]
        if r:
            loss_starts.append(len(input_ids))
            loss_ends.append(len(input_ids) + len(tokenized))
        input_ids.extend(tokenized)
    return input_ids, loss_starts, loss_ends
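
# Illustrative sketch (not part of the original file); the checkpoint name is an assumption,
# any Hugging Face tokenizer works:
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("gpt2")
#   ids, starts, ends = tokenize_and_concatenate(tok, ["Q: hi\n", "A: hello"], [False, True])
#   # ids[starts[0]:ends[0]] holds exactly the token ids of "A: hello", i.e. the span trained on.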


def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: str):
    # Separate the templated prompt into chunks by human/assistant's lines, preparing data for tokenize_and_concatenate
    start_idx = 0
    chunks = []
    require_loss = []
    for line in messages:
        first_occur = prompt.find(line["content"], start_idx)
        if prompt[first_occur - 1] != " ":
            chunks.append(prompt[start_idx:first_occur])
            chunks.append(prompt[first_occur : first_occur + len(line["content"])])
        else:
            chunks.append(prompt[start_idx : first_occur - 1])
            chunks.append(prompt[first_occur - 1 : first_occur + len(line["content"])])
        start_idx = first_occur + len(line["content"])
        if line["role"].lower() == "assistant":
            require_loss.append(False)
            require_loss.append(True)
        else:
            require_loss.append(False)
            require_loss.append(False)
    chunks.append(prompt[start_idx:])
    require_loss.append(False)
    return chunks, require_loss
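
# Illustrative usage (not part of the original file), with a toy chat template:
#   >>> messages = [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
#   >>> split_templated_prompt_into_chunks(messages, "<user> Hi <assistant> Hello!")
#   (['<user>', ' Hi', ' <assistant>', ' Hello!', ''], [False, False, False, True, False])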