mirror of https://github.com/hpcaitech/ColossalAI
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.5 KiB
46 lines
1.5 KiB
from collections import defaultdict
from typing import Dict, Optional

import torch
import transformers
from torch.utils.data import Dataset

from colossalai.logging import get_dist_logger

from .utils import jload
|
class PromptDataset(Dataset):
    """Dataset of tokenized instruction prompts for supervised fine-tuning.

    Loads a JSON file of records, tokenizes each record's "instruction"
    field to a fixed length, and serves the resulting tensors one example
    at a time as ``{key: tensor}`` dicts.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        max_datasets_size: Optional[int] = None,
        max_length: int = 96,
    ):
        """Load, optionally truncate, and tokenize the dataset.

        Args:
            data_path: Path to a JSON file readable by ``jload``; each entry
                must contain an ``"instruction"`` key.
            tokenizer: HuggingFace tokenizer used to encode the instructions.
            max_datasets_size: If given, keep only the first N examples.
            max_length: Every prompt is padded/truncated to this many tokens.
        """
        super().__init__()
        self.keyed_prompt = defaultdict(list)
        self.logger = get_dist_logger()
        self.logger.info("Loading data...")
        list_data_dict = jload(data_path)
        self.logger.info(f"Loaded {len(list_data_dict)} examples.")

        if max_datasets_size is not None:
            self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
            list_data_dict = list_data_dict[:max_datasets_size]

        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
        # Tokenize all prompts in one batch to a uniform length.
        tokens = tokenizer(
            instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
        )
        # Move each batched tensor to the current CUDA device once, up front,
        # then unbind into per-example views so __getitem__ is a cheap lookup.
        # NOTE(review): this hard-requires an initialized CUDA device.
        for k, tensor in tokens.items():
            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()

    def __len__(self):
        # Number of tokenized examples (every key holds one entry per example).
        return len(self.keyed_prompt["input_ids"])

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # Return the i-th example as a dict of per-key tensors.
        return {k: v[i] for k, v in self.keyed_prompt.items()}