import json
import os
import re
from copy import deepcopy
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]

chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]
english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]

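# Defaults handed to the inference step for every subject. "all_classes" is filled in
# per subject below when the option letters can be recovered from the question text,
# and "language" is switched to English for the English subjects.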
default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


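# Extracts the option letters that appear in a multiple-choice question. For example,
# a question containing "A. ... B. ... C. ... D. ..." yields ["A", "B", "C", "D"];
# capital letters picked up from elsewhere in the text are discarded because only the
# leading unbroken A, B, C, ... run is kept.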
def get_all_classes(instruction: str):
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    pattern = r"([A-Z]\. |[A-Z].|[A-Z]\.)"
    options = sorted(list(set(re.findall(pattern, instruction))))
    options = sorted(list(set([string[0] for string in options])))

    for i in range(len(options)):
        if options[i] == letters[i]:
            continue
        else:
            return options[0:i]
    return options


class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for the GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class converts the original dataset into the inference dataset.

    A few typos in the original dataset had to be corrected manually; the following have been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            files = os.listdir(os.path.join(path, "data", category))
            files.sort()

            for file in files:
                # File names look like "2010-2022_Chemistry_MCQs.json"; strip the
                # 10-character year prefix and the ".json" suffix to get the subject name.
                subject = file[10:-5].split("_")
                subject = " ".join(subject)
                dataset["test"][subject] = {"data": []}

                file_dir = os.path.join(path, "data", category, file)

                with open(file_dir, encoding="utf-8") as f:
                    data = json.load(f)

                # It has been verified that all data samples within one subcategory share the same inference arguments.
                inference_kwargs = deepcopy(default_inference_kwargs)
                if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
                    all_classes = get_all_classes(data["example"][0]["question"])
                    inference_kwargs["all_classes"] = all_classes
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                dataset["test"][subject]["inference_kwargs"] = inference_kwargs

                for sample in data["example"]:
                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because a list-valued target should only be used for multiple target answers.
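                    # e.g. an answer given as ["A", "B"] becomes "AB"; a single-element
                    # list such as ["C"] is unwrapped to "C" below.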
                    if subject in multi_choice_datasets:
                        sample["answer"] = "".join(sample["answer"])

                    if isinstance(sample["answer"], list) and len(sample["answer"]) == 1:
                        sample["answer"] = sample["answer"][0]

                    data_sample = {
                        "dataset": "gaokaobench",
                        "split": "test",
                        # e.g. "Multiple-choice_Questions"[:-10] -> "Multiple-choice"
                        "category": f"{category[:-10]}-{subject}",
                        # "答案:" means "Answer:"
                        "instruction": sample["question"].strip() + "\n答案:",
                        "input": "",
                        "output": "",
                        "target": sample["answer"],
                    }

                    dataset["test"][subject]["data"].append(data_sample)

        return dataset
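

if __name__ == "__main__":
    # Minimal usage sketch. The path below is a placeholder for a local copy of the
    # GAOKAO-Bench data, and the logger comes from colossalai's standard helper; the
    # evaluation pipeline normally builds this dataset through its own entrypoint.
    from colossalai.logging import get_dist_logger

    gaokao = GaoKaoBenchDataset.load("/path/to/GAOKAO-Bench", get_dist_logger(), few_shot=False)
    for subject, subset in gaokao["test"].items():
        print(subject, len(subset["data"]), subset["inference_kwargs"]["language"])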