ColossalAI/applications/Chat/coati/dataset/sft_dataset.py

#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.

import copy
import random
from dataclasses import dataclass, field
from typing import Callable, Dict, Sequence

import torch
import torch.distributed as dist
import transformers
from torch.utils.data import Dataset
from tqdm import tqdm

from colossalai.logging import get_dist_logger

from .utils import is_rank_0, jload

# Module-level logger obtained from colossalai.logging.get_dist_logger();
# SupervisedDataset uses it to report loading / formatting / tokenizing progress.
logger = get_dist_logger()

# Label value written over positions that must not be scored (prompt tokens and
# label padding in this module). -100 is the default ``ignore_index`` of
# ``torch.nn.CrossEntropyLoss``.
IGNORE_INDEX = -100

# Prompt templates filled with ``str.format_map`` from a data record:
#   "prompt_input"    needs the ``instruction`` and ``input`` fields,
#   "prompt_no_input" needs only ``instruction``.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request."
        "\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        "\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
}


class SFTDataset(Dataset):
    """
    Prompt/completion dataset for supervised fine-tuning.

    Each record is turned into the text ``prompt + completion + eos_token``,
    tokenized on its own with padding/truncation to ``max_length``, and kept
    in memory. The labels are an independent deep copy of the token ids.

    Args:
        dataset: iterable of records providing ``'prompt'`` and ``'completion'`` strings
        tokenizer: tokenizer called on every text; its ``eos_token`` is appended to each sample
        max_length: length each sample is padded or truncated to
    """

    def __init__(self, dataset, tokenizer: Callable, max_length: int = 512) -> None:
        super().__init__()
        encoded_samples = []

        # The progress bar is only displayed by rank 0.
        for record in tqdm(dataset, disable=not is_rank_0()):
            text = record['prompt'] + record['completion'] + tokenizer.eos_token
            encoding = tokenizer(
                text,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            # Index 0 selects the single sequence out of the returned batch.
            encoded_samples.append(encoding['input_ids'][0])

        self.input_ids = encoded_samples
        self.labels = copy.deepcopy(encoded_samples)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx], "labels": self.labels[idx]}


def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer,
                 max_length: int
                 ) -> Dict[str, torch.Tensor]:
    """Tokenize a list of strings as one batch.

    The whole list is tokenized in a single call, truncated to ``max_length``
    and padded to the longest sequence of the batch.

    Args:
        strings: texts to tokenize together.
        tokenizer: tokenizer called on the batch; it must be able to pad.
        max_length: maximum number of tokens kept per text.

    Returns:
        A dict with

        * ``input_ids`` / ``labels``: the same padded token-id tensor,
        * ``attention_mask``: bool tensor, ``True`` on real (non-padding) tokens,
        * ``input_ids_lens`` / ``labels_lens``: number of real tokens per row.
    """
    tokenized_list = tokenizer(
        strings, return_tensors="pt", padding="longest",
        max_length=max_length, truncation=True
    )
    input_ids = labels = tokenized_list["input_ids"]

    # Prefer the tokenizer's own mask: comparing ids against pad_token_id
    # miscounts whenever the pad token is also a real token (e.g. pad == eos).
    if "attention_mask" in tokenized_list:
        attention_mask = tokenized_list["attention_mask"].bool()
    else:
        # Fallback for tokenizers that return no mask.
        attention_mask = input_ids.ne(tokenizer.pad_token_id)

    input_ids_lens = labels_lens = attention_mask.sum(dim=-1)
    return dict(
        input_ids=input_ids,
        labels=labels,
        attention_mask=attention_mask,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    max_length: int,
) -> Dict:
    """Preprocess the data by tokenizing.

    Each example is ``source + target``. The labels are a copy of the example
    token ids in which every position that must not be scored is replaced by
    ``IGNORE_INDEX``: the padding added by batched tokenization and the tokens
    of the source (prompt) part.

    Args:
        sources: prompt texts.
        targets: response texts, paired with ``sources`` by position.
        tokenizer: tokenizer used for both the full examples and the sources.
        max_length: maximum number of tokens kept per text.

    Returns:
        ``dict(input_ids=..., labels=...)``, two tensors of identical shape.
    """
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [
        _tokenize_fn(strings, tokenizer, max_length)
        for strings in (examples, sources)
    ]
    input_ids = examples_tokenized["input_ids"]
    labels = input_ids.clone()

    # The batch is padded to its longest row, so padding is part of the stored
    # tensors; without this it would be scored as ordinary target tokens.
    labels[~examples_tokenized["attention_mask"]] = IGNORE_INDEX

    padded_len = input_ids.size(-1)
    left_padded = getattr(tokenizer, "padding_side", "right") == "left"
    source_lens = sources_tokenized["input_ids_lens"].tolist()
    example_lens = examples_tokenized["input_ids_lens"].tolist()
    for label, source_len, example_len in zip(labels, source_lens, example_lens):
        # Right padding: |source|target|pad|  -> the source starts at 0.
        # Left padding:  |pad|source|target|  -> the source starts after the pad.
        start = padded_len - example_len if left_padded else 0
        label[start:start + source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)


class SupervisedDataset(Dataset):
    """Instruction-following dataset for supervised fine-tuning.

    Records are loaded from ``data_path`` with ``jload``. Each record supplies
    ``instruction`` and ``output`` and may supply ``input``; a record whose
    ``input`` is missing or empty uses the "no input" prompt template. The
    prompt is the source, ``output + eos_token`` is the target, and both are
    tokenized by ``preprocess``.

    Args:
        data_path: location of the data file handed to ``jload``.
        tokenizer: tokenizer used by ``preprocess``; its ``eos_token`` ends every target.
        max_datasets_size: if not ``None``, only the first this-many records are used.
        max_length: maximum token length forwarded to ``preprocess``.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, max_datasets_size: int = None, max_length: int = 512):
        super().__init__()
        logger.info("Loading data...")
        list_data_dict = jload(data_path)
        logger.info(f"Loaded {len(list_data_dict)} examples.")

        if max_datasets_size is not None:
            logger.info(f"Limiting dataset to {max_datasets_size} examples.")
            list_data_dict = list_data_dict[:max_datasets_size]

        logger.info("Formatting inputs...")
        with_input_template = PROMPT_DICT["prompt_input"]
        without_input_template = PROMPT_DICT["prompt_no_input"]

        sources = []
        for example in list_data_dict:
            if example.get("input", "") == "":
                template = without_input_template
            else:
                template = with_input_template
            sources.append(template.format_map(example))

        targets = []
        for example in list_data_dict:
            targets.append(f"{example['output']}{tokenizer.eos_token}")

        logger.info("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer, max_length)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Batch collator for supervised fine-tuning samples.

    Stacks the ``input_ids`` and ``labels`` of the given instances into
    batch-first tensors, padding shorter sequences at the end: inputs with the
    tokenizer's ``pad_token_id``, labels with ``IGNORE_INDEX``. The attention
    mask marks every input position whose id differs from ``pad_token_id``.
    """

    # Only ``pad_token_id`` is read from the tokenizer.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        pad_sequence = torch.nn.utils.rnn.pad_sequence

        id_seqs = [instance["input_ids"] for instance in instances]
        label_seqs = [instance["labels"] for instance in instances]

        input_ids = pad_sequence(id_seqs, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(label_seqs, batch_first=True, padding_value=IGNORE_INDEX)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import copy`
			`import random`
			`from dataclasses import dataclass, field`
			`from typing import Callable, Dict, Sequence`

			`import torch`
			`import torch.distributed as dist`
			`import transformers`
			`from torch.utils.data import Dataset`
			`from tqdm import tqdm`

			`from colossalai.logging import get_dist_logger`

			`from .utils import is_rank_0, jload`

			`logger = get_dist_logger()`

			`IGNORE_INDEX = -100`
			`PROMPT_DICT = {`
			`"prompt_input":`
			`("Below is an instruction that describes a task, paired with an input that provides further context. "`
			`"Write a response that appropriately completes the request.\n\n"`
			`"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),`
			`"prompt_no_input": ("Below is an instruction that describes a task. "`
			`"Write a response that appropriately completes the request.\n\n"`
			`"### Instruction:\n{instruction}\n\n### Response:"),`
			`}`


			`class SFTDataset(Dataset):`
			`"""`
			`Dataset for sft model`

			`Args:`
			`dataset: dataset for supervised model`
			`tokenizer: tokenizer for supervised model`
			`max_length: max length of input`
			`"""`

			`def __init__(self, dataset, tokenizer: Callable, max_length: int = 512) -> None:`
			`super().__init__()`
			`self.input_ids = []`

			`for data in tqdm(dataset, disable=not is_rank_0()):`
fix: fix sft (#3568) 2023-04-17 08:47:44 +00:00			`prompt = data['prompt'] + data['completion'] + tokenizer.eos_token`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`prompt_token = tokenizer(prompt,`
			`max_length=max_length,`
			`padding="max_length",`
			`truncation=True,`
			`return_tensors="pt")`

fix: fix sft (#3568) 2023-04-17 08:47:44 +00:00			`self.input_ids.append(prompt_token['input_ids'][0])`
			`self.labels = copy.deepcopy(self.input_ids)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00
			`def __len__(self):`
fix: fix sft (#3568) 2023-04-17 08:47:44 +00:00			`length = len(self.input_ids)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`return length`

			`def __getitem__(self, idx):`
			`return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])`


[chat] refactor actor class (#3968) * refactor: separate log_probs fn from Actor forward fn * refactor: separate generate fn from Actor class * feat: update unwrap_model and get_base_model * unwrap_model returns model not wrapped by Strategy * get_base_model returns HF model for Actor, Critic and RewardModel * feat: simplify Strategy.prepare * style: remove get_base_model method of Actor * perf: tokenize text in batches * refactor: move calc_action_log_probs to utils of model * test: update test with new forward fn * style: rename forward fn args * fix: do not unwrap model in save_model fn of naive strategy * test: add gemini test for train_prompts * fix: fix _set_default_generate_kwargs 2023-06-13 05:31:56 +00:00			`def _tokenize_fn(strings: Sequence[str],`
			`tokenizer: transformers.PreTrainedTokenizer,`
			`max_length: int`
			`) -> Dict[str, torch.Tensor]:`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`"""Tokenize a list of strings."""`
[chat] refactor actor class (#3968) * refactor: separate log_probs fn from Actor forward fn * refactor: separate generate fn from Actor class * feat: update unwrap_model and get_base_model * unwrap_model returns model not wrapped by Strategy * get_base_model returns HF model for Actor, Critic and RewardModel * feat: simplify Strategy.prepare * style: remove get_base_model method of Actor * perf: tokenize text in batches * refactor: move calc_action_log_probs to utils of model * test: update test with new forward fn * style: rename forward fn args * fix: do not unwrap model in save_model fn of naive strategy * test: add gemini test for train_prompts * fix: fix _set_default_generate_kwargs 2023-06-13 05:31:56 +00:00			`tokenized_list = tokenizer(`
			`strings, return_tensors="pt", padding="longest",`
			`max_length=max_length, truncation=True`
			`)`
			`input_ids = labels = tokenized_list["input_ids"]`
			`input_ids_lens = labels_lens = \`
			`tokenized_list["input_ids"].ne(tokenizer.pad_token_id).sum(dim=-1)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`return dict(`
			`input_ids=input_ids,`
			`labels=labels,`
			`input_ids_lens=input_ids_lens,`
			`labels_lens=labels_lens,`
			`)`


			`def preprocess(`
			`sources: Sequence[str],`
			`targets: Sequence[str],`
			`tokenizer: transformers.PreTrainedTokenizer,`
[Chat] fix the tokenizer "int too big to convert" error in SFT training (#3453) * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * update roberta with coati * chat ci update * Revert "chat ci update" This reverts commit 17ae7ae01fa752bd3289fc39069868fde99cf846. * [Chat] fix the tokenizer "int too big to convert" error in SFT training fix the tokenizer error during SFT training using Bloom and OPT 2023-04-06 01:30:28 +00:00			`max_length: int,`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`) -> Dict:`
			`"""Preprocess the data by tokenizing."""`
			`examples = [s + t for s, t in zip(sources, targets)]`
[chat] refactor actor class (#3968) * refactor: separate log_probs fn from Actor forward fn * refactor: separate generate fn from Actor class * feat: update unwrap_model and get_base_model * unwrap_model returns model not wrapped by Strategy * get_base_model returns HF model for Actor, Critic and RewardModel * feat: simplify Strategy.prepare * style: remove get_base_model method of Actor * perf: tokenize text in batches * refactor: move calc_action_log_probs to utils of model * test: update test with new forward fn * style: rename forward fn args * fix: do not unwrap model in save_model fn of naive strategy * test: add gemini test for train_prompts * fix: fix _set_default_generate_kwargs 2023-06-13 05:31:56 +00:00			`examples_tokenized, sources_tokenized = [`
			`_tokenize_fn(strings, tokenizer, max_length)`
			`for strings in (examples, sources)`
			`]`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`input_ids = examples_tokenized["input_ids"]`
			`labels = copy.deepcopy(input_ids)`
			`for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):`
			`label[:source_len] = IGNORE_INDEX`
			`return dict(input_ids=input_ids, labels=labels)`


			`class SupervisedDataset(Dataset):`
			`"""Dataset for supervised fine-tuning."""`

[Chat] fix the tokenizer "int too big to convert" error in SFT training (#3453) * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * update roberta with coati * chat ci update * Revert "chat ci update" This reverts commit 17ae7ae01fa752bd3289fc39069868fde99cf846. * [Chat] fix the tokenizer "int too big to convert" error in SFT training fix the tokenizer error during SFT training using Bloom and OPT 2023-04-06 01:30:28 +00:00			`def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, max_datasets_size: int = None, max_length: int = 512):`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00			`super(SupervisedDataset, self).__init__()`
			`logger.info("Loading data...")`
			`list_data_dict = jload(data_path)`
			`logger.info(f"Loaded {len(list_data_dict)} examples.")`

			`if max_datasets_size is not None:`
			`logger.info(f"Limiting dataset to {max_datasets_size} examples.")`
			`list_data_dict = list_data_dict[:max_datasets_size]`

			`logger.info("Formatting inputs...")`
			`prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]`
			`sources = [`
			`prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)`
			`for example in list_data_dict`
			`]`
			`targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]`

			`logger.info("Tokenizing inputs... This may take some time...")`
[Chat] fix the tokenizer "int too big to convert" error in SFT training (#3453) * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * Add RoBERTa for RLHF Stage 2 & 3 (test) RoBERTa for RLHF Stage 2 & 3 (still in testing) * Revert "Add RoBERTa for RLHF Stage 2 & 3 (test)" This reverts commit 06741d894dcbe958acd4e10d771f22275e20e368. * Add RoBERTa for RLHF stage 2 & 3 1. add roberta folder under model folder 2. add roberta option in train_reward_model.py 3. add some test in testci * Update test_ci.sh * Revert "Update test_ci.sh" This reverts commit 9c7352b81766f3177d31eeec0ec178a301df966a. * update roberta with coati * chat ci update * Revert "chat ci update" This reverts commit 17ae7ae01fa752bd3289fc39069868fde99cf846. * [Chat] fix the tokenizer "int too big to convert" error in SFT training fix the tokenizer error during SFT training using Bloom and OPT 2023-04-06 01:30:28 +00:00			`data_dict = preprocess(sources, targets, tokenizer, max_length)`
[Coati] first commit (#3283) 2023-03-28 12:25:36 +00:00
			`self.input_ids = data_dict["input_ids"]`
			`self.labels = data_dict["labels"]`

			`def __len__(self):`
			`return len(self.input_ids)`

			`def __getitem__(self, i) -> Dict[str, torch.Tensor]:`
			`return dict(input_ids=self.input_ids[i], labels=self.labels[i])`


			`@dataclass`
			`class DataCollatorForSupervisedDataset(object):`
			`"""Collate examples for supervised fine-tuning."""`

			`tokenizer: transformers.PreTrainedTokenizer`

			`def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:`
			`input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))`
			`input_ids = torch.nn.utils.rnn.pad_sequence(input_ids,`
			`batch_first=True,`
			`padding_value=self.tokenizer.pad_token_id)`
			`labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)`
			`return dict(`
			`input_ids=input_ids,`
			`labels=labels,`
			`attention_mask=input_ids.ne(self.tokenizer.pad_token_id),`
			`)`