2023-06-12 07:02:27 +00:00
|
|
|
import torch
|
|
|
|
from datasets import load_dataset
|
2023-09-07 09:38:45 +00:00
|
|
|
from torch.utils.data import Dataset
|
|
|
|
|
2023-06-12 07:02:27 +00:00
|
|
|
|
|
|
|
class BeansDataset(Dataset):
    """Map-style dataset over one split of the "beans" image dataset.

    Every example of the chosen split is run through ``image_processor``
    once, at construction time, and the results are kept in memory;
    indexing simply returns the cached item.

    Args:
        image_processor: Callable invoked as
            ``image_processor(image, return_tensors="pt")``. The object it
            returns must support item assignment, because the example's
            label is stored on it under the ``"labels"`` key.
        tp_size: The list of label names is padded with placeholder names
            until its length is a multiple of this value (presumably the
            tensor-parallel degree -- confirm against the caller). Must be
            at least 1.
        split: Name of the dataset split to load.

    Raises:
        ValueError: If ``tp_size`` is smaller than 1.
    """

    def __init__(self, image_processor, tp_size=1, split="train"):
        super().__init__()
        self.image_processor = image_processor
        self.ds = load_dataset("beans")[split]
        # Pad a copy of the names: appending to the list returned by the
        # dataset feature would modify the feature's own label list.
        self.label_names = self._pad_label_names(
            self.ds.features["labels"].names, tp_size
        )
        self.num_labels = len(self.label_names)
        # Preprocess eagerly so __getitem__ is a plain list lookup.
        self.inputs = [self.process_example(example) for example in self.ds]

    @staticmethod
    def _pad_label_names(names, tp_size):
        """Return a new list of ``names`` padded to a multiple of ``tp_size``.

        Placeholder entries are named ``pad_label_<index>``, where ``<index>``
        is the position of the placeholder in the resulting list. The input
        sequence is never modified.
        """
        if tp_size < 1:
            raise ValueError(f"tp_size must be a positive integer, got {tp_size}")
        padded = list(names)
        while len(padded) % tp_size != 0:
            padded.append(f"pad_label_{len(padded)}")
        return padded

    def __len__(self):
        """Return the number of preprocessed examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the preprocessed example stored at position ``idx``."""
        return self.inputs[idx]

    def process_example(self, example):
        """Run the image processor on one raw example and attach its label.

        Args:
            example: Mapping with an ``"image"`` entry (passed to the image
                processor) and a ``"labels"`` entry (copied through as-is).

        Returns:
            The image processor's output with ``"labels"`` set on it.
        """
        processed = self.image_processor(example["image"], return_tensors="pt")
        processed["labels"] = example["labels"]
        return processed
|
2023-09-07 09:38:45 +00:00
|
|
|
|
2023-06-12 07:02:27 +00:00
|
|
|
|
|
|
|
def beans_collator(batch):
    """Merge a sequence of per-example dicts into one batch dict.

    Args:
        batch: Sequence of mappings, each holding a ``"pixel_values"`` tensor
            and a ``"labels"`` value.

    Returns:
        A dict whose ``"pixel_values"`` entry is the per-example tensors
        concatenated along dimension 0, and whose ``"labels"`` entry is an
        ``int64`` tensor built from the per-example labels.
    """
    pixel_chunks = [sample["pixel_values"] for sample in batch]
    label_ids = [sample["labels"] for sample in batch]

    collated = {}
    # Concatenate (rather than stack) along dim 0, so each example's own
    # leading dimension is merged into the batch dimension.
    collated["pixel_values"] = torch.cat(pixel_chunks, dim=0)
    collated["labels"] = torch.tensor(label_ids, dtype=torch.int64)
    return collated
|