update tokenization function

pull/6119/head
Tong Li 2024-11-11 07:26:32 +00:00
parent dcb509c8e3
commit 1210dbea97
1 changed file with 45 additions and 1 deletion

@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-tokenization utils for constructing dataset for ppo, dpo, sft, rm
+Tokenization Utils for Constructing Dataset for RL.
 """
 import warnings
 from copy import deepcopy
 from typing import Any, Dict, List, Union
+import torch
 from coati.dataset.conversation import Conversation
 from coati.dataset.utils import split_templated_prompt_into_chunks, tokenize_and_concatenate
 from datasets import dataset_dict
@@ -393,3 +394,46 @@ def tokenize_kto(
         "input_id_decode": decoded_full_prompt,
         "completion_decode": decoded_completion,
     }
+
+
+def tokenize_process_reward(
+    data_point: Dict[str, str],
+    tokenizer: PreTrainedTokenizer,
+    conversation_template: Conversation = None,
+    max_length: int = 4096,
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    Tokenization function for the Math-Shepherd process-reward dataset.
+    Each data point has the following format:
+        {
+            "input": problem + step-by-step solution,
+            "label": problem + step-by-step solution with automatic label,
+            "task": GSM8K or MATH
+        }
+    """
+    input = data_point["input"]
+    label = data_point["label"]
+    template = deepcopy(conversation_template)
+    template.append_message("user", input)
+    template.append_message("assistant", label)
+    prompt = template.get_prompt(add_generation_prompt=True)
+    reward_signal_id = tokenizer.convert_tokens_to_ids(template.reward_signal)
+    tokenized = tokenizer(prompt, add_special_tokens=False)["input_ids"]
+    tokenized_tensor = torch.tensor(tokenized)
+    loss_mask = torch.isin(tokenized_tensor, torch.tensor(reward_signal_id))
+    label = (tokenized_tensor * loss_mask).tolist()
+    decoded_input = tokenizer.decode(tokenized, skip_special_tokens=False)
+    decoded_label = tokenizer.decode(label, skip_special_tokens=False)
+    return {
+        "input": tokenized,
+        "label": label,
+        "loss_mask": loss_mask,
+        "decoded_input": decoded_input,
+        "decoded_label": decoded_label,
+    }
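
For context, a rough usage sketch of the new helper. Everything other than tokenize_process_reward itself is an assumption for illustration and not part of this commit: the tokenizer checkpoint, the sample data point, the "<reward>" placeholder token, and the prepared conversation_template, whose reward_signal tokens must already exist in the tokenizer vocabulary.

# Illustrative only: `conversation_template` is assumed to come from the existing
# dataset-preparation pipeline, with `reward_signal` set to real vocabulary tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # any HF tokenizer

problem = "Janet has 3 apples and buys 2 more. How many apples does she have?"
solution = "Step 1: 3 + 2 = 5. Step 2: The answer is 5."
data_point = {
    # Matches the docstring layout: both fields contain problem + step-by-step solution,
    # and "label" additionally carries a reward-signal token after each step.
    "input": problem + " " + solution,
    "label": problem + " Step 1: 3 + 2 = 5. <reward> Step 2: The answer is 5. <reward>",
    "task": "GSM8K",
}

sample = tokenize_process_reward(data_point, tokenizer, conversation_template, max_length=4096)
print(sample["decoded_input"])         # chat-formatted problem + labelled solution
print(int(sample["loss_mask"].sum()))  # number of reward-signal positions (one per step)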