ColossalAI/applications/Chat/coati/dataset/prompt_dataset.py

from collections import defaultdict
from typing import Dict, Optional

import torch
import transformers
from torch.utils.data import Dataset

from colossalai.logging import get_dist_logger

from .utils import jload


class PromptDataset(Dataset):
    """Dataset of tokenized prompts (no labels), used to query the actor during RLHF training."""
    def __init__(self,
                 data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 max_datasets_size: Optional[int] = None,
                 max_length: int = 96):
        super().__init__()
        self.keyed_prompt = defaultdict(list)
        self.logger = get_dist_logger()
        self.logger.info("Loading data...")
        list_data_dict = jload(data_path)
        self.logger.info(f"Loaded {len(list_data_dict)} examples.")

        if max_datasets_size is not None:
            self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
            list_data_dict = list_data_dict[:max_datasets_size]

        # Batch-tokenize every instruction, padding/truncating to a fixed
        # length so the prompts stack into rectangular tensors.
        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
        tokens = tokenizer(instructions,
                           return_tensors='pt',
                           max_length=max_length,
                           padding='max_length',
                           truncation=True)
        # unbind() splits each batched tensor (input_ids, attention_mask, ...)
        # into a tuple of per-example views. The tensors are moved to the
        # current CUDA device eagerly, so this dataset assumes a GPU exists.
        for k, tensor in tokens.items():
            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()

    def __len__(self):
        return len(self.keyed_prompt["input_ids"])

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return {k: v[i] for k, v in self.keyed_prompt.items()}
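

# A minimal usage sketch, not part of the original module. It assumes a JSON
# file of records shaped like [{"instruction": "..."}], and uses the GPT-2
# tokenizer purely for illustration; GPT-2 ships without a pad token, so one
# must be assigned before padding. A CUDA device is required because __init__
# moves the tokenized prompts onto the current GPU.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token    # reuse EOS as the padding token

    # "prompts.json" is a hypothetical path, not shipped with the repo.
    dataset = PromptDataset("prompts.json", tok, max_datasets_size=128)
    loader = DataLoader(dataset, batch_size=8, shuffle=True)

    batch = next(iter(loader))       # e.g. {"input_ids": (8, 96) tensor, ...}
    print({k: v.shape for k, v in batch.items()})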