mirror of https://github.com/hpcaitech/ColossalAI
71 lines
2.2 KiB
Python
71 lines
2.2 KiB
Python
from collections import defaultdict
|
|
from copy import deepcopy
|
|
from typing import Dict, List
|
|
|
|
from colossal_eval.utils import jload
|
|
|
|
from colossalai.logging import DistributedLogger
|
|
|
|
from .base import BaseDataset
|
|
|
|
# Baseline inference settings. ColossalDataset.load deep-copies this mapping
# for every category, so per-category overrides never leak back into it.
default_inference_kwargs = dict(
    calculate_loss=False,
    all_classes=None,
    language="Chinese",
    pretrain=False,
    max_new_tokens=256,
)

# Register your own subcategories below:
#   - `single_choice_question`: categories answered with one of A/B/C/D.
#   - `calculate_loss`: categories that have target answers and therefore
#     need the loss to be calculated.
single_choice_question = set()
calculate_loss = set()
|
|
|
|
|
|
def get_data_per_category(data):
    """
    Bucket dataset records by their ``"category"`` field.

    Args:
        data: Iterable of mapping-like records, each providing a
            ``"category"`` key.

    Returns:
        A ``defaultdict(list)`` mapping each category to the records that
        carry it. Categories appear in first-seen order and records keep
        their original relative order.

    Raises:
        KeyError: If a record has no ``"category"`` key.
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record["category"]].append(record)
    return buckets
|
|
|
|
|
|
class ColossalDataset(BaseDataset):
    """
    Loader for the Colossal dataset.

    Reshapes the raw dataset records into the per-category inference
    dataset layout.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        """
        Read the dataset file at ``path`` and build the inference dataset.

        Args:
            path: Location of the dataset file, handed to ``jload``.
            logger: Accepted as part of the signature; not used in this method.
            few_shot: Accepted as part of the signature; not used in this method.

        Returns:
            A nested dict of the form
            ``{"test": {category: {"data": [...], "inference_kwargs": {...}}}}``.
            NOTE(review): the ``List[Dict]`` annotation does not match this
            shape; it is left as-is to keep the signature unchanged.

        Raises:
            KeyError: If a record lacks any of the ``"category"``,
                ``"instruction"``, ``"input"``, ``"target"`` or ``"id"`` keys.
        """
        test_split = {}

        for category, records in get_data_per_category(jload(path)).items():
            # Every category owns an independent copy of the defaults so the
            # overrides below stay local to it.
            kwargs = deepcopy(default_inference_kwargs)
            if category in calculate_loss:
                kwargs["calculate_loss"] = True
            if category in single_choice_question:
                kwargs["all_classes"] = ["A", "B", "C", "D"]

            samples = [
                {
                    "dataset": "colossal",
                    "split": "test",
                    "category": category,
                    "instruction": record["instruction"],
                    "input": record["input"],
                    "output": "",
                    "target": record["target"],
                    "id": record["id"],
                }
                for record in records
            ]

            test_split[category] = {"data": samples, "inference_kwargs": kwargs}

        return {"test": test_split}
|