mirror of https://github.com/hpcaitech/ColossalAI
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
46 lines
1.5 KiB
46 lines
1.5 KiB
from collections import defaultdict
from typing import Dict, Optional

import torch
import transformers
from torch.utils.data import Dataset

from colossalai.logging import get_dist_logger

from .utils import jload
|
class PromptDataset(Dataset):
    """Dataset of tokenized instruction prompts for supervised fine-tuning.

    Loads a JSON file of records, tokenizes each record's "instruction"
    field to a fixed length, and serves the resulting tensors one example
    at a time as ``{key: tensor}`` dicts.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        max_datasets_size: Optional[int] = None,
        max_length: int = 96,
    ):
        """Load, optionally truncate, and tokenize the dataset.

        Args:
            data_path: Path to a JSON file readable by ``jload``; each entry
                must contain an ``"instruction"`` key.
            tokenizer: HuggingFace tokenizer used to encode the instructions.
            max_datasets_size: If given, keep only the first N examples.
            max_length: Every prompt is padded/truncated to this many tokens.
        """
        super().__init__()
        self.keyed_prompt = defaultdict(list)
        self.logger = get_dist_logger()
        self.logger.info("Loading data...")
        list_data_dict = jload(data_path)
        self.logger.info(f"Loaded {len(list_data_dict)} examples.")

        if max_datasets_size is not None:
            self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
            list_data_dict = list_data_dict[:max_datasets_size]

        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
        # Tokenize all prompts in one batch to a uniform length.
        tokens = tokenizer(
            instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
        )
        # Move each batched tensor to the current CUDA device once, up front,
        # then unbind into per-example views so __getitem__ is a cheap lookup.
        # NOTE(review): this hard-requires an initialized CUDA device.
        for k, tensor in tokens.items():
            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()

    def __len__(self):
        # Number of tokenized examples (every key holds one entry per example).
        return len(self.keyed_prompt["input_ids"])

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # Return the i-th example as a dict of per-key tensors.
        return {k: v[i] for k, v in self.keyed_prompt.items()}