# ColossalAI/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
#
# 123 lines
# 4.5 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters!
#
# This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import os
import re
from copy import deepcopy
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
# Subjects whose per-sample answer list is joined into a single string by
# `GaoKaoBenchDataset.load`, and for which `all_classes` is left unset.
multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]

# Subjects whose inference language is set to "Chinese".
chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]

# Subjects whose inference language is set to "English".
english_qa_datasets = [
    "English MCQs",
    "English Fill in Blanks",
    "English Reading Comp",
    "English Cloze Test",
]

# Baseline inference arguments. `GaoKaoBenchDataset.load` deep-copies this
# dict for every subject and then overrides `all_classes` and `language`.
default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}
def get_all_classes(instruction: str) -> List[str]:
    """Return the option letters ("A", "B", ...) offered by a multiple-choice question.

    An option label is an uppercase ASCII letter immediately followed by a
    period, either the ASCII "." or the fullwidth "\uFF0E" form. The distinct
    labels are sorted and only the leading run that matches "A", "B", "C", ...
    without a gap is kept, so a stray later label (e.g. "A. B. D.") does not
    extend the result past the gap.

    Args:
        instruction: The full question text, including its options.

    Returns:
        The contiguous option letters starting from "A"; an empty list when
        the text has no "A" label.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # The period is required on purpose: a bare `[A-Z]` would count every
    # capital letter in the question stem (e.g. the "E" of "Excuse") as an
    # option. `\uFF0E` is FULLWIDTH FULL STOP, spelled as an escape so it
    # cannot be confused with (or silently replaced by) the ASCII period.
    pattern = r"[A-Z][.\uFF0E]"
    options = sorted({match[0] for match in re.findall(pattern, instruction)})

    for i, (option, letter) in enumerate(zip(options, letters)):
        if option != letter:
            # First gap in the A, B, C, ... sequence: drop it and everything after.
            return options[:i]
    return options
class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for the GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class converts the original dataset into the inference dataset.

    A few typos had to be corrected manually in the original dataset; some of the following have been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        """Read every GAOKAO-Bench JSON file under ``path/data`` into the inference format.

        Args:
            path: Root directory of the dataset; the files are read from
                ``<path>/data/<category>/``.
            logger: Not used by this loader.
            few_shot: Not used by this loader.

        Returns:
            A dict of the form
            ``{"test": {subject: {"data": [...], "inference_kwargs": {...}}}}``.
            Note that the return annotation says ``List[Dict]`` although a
            dict is returned.
        """
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            category_dir = os.path.join(path, "data", category)
            # os.listdir returns every directory entry; keep only the JSON
            # files so stray entries (e.g. ".DS_Store") are not fed to json.load.
            files = sorted(file for file in os.listdir(category_dir) if file.endswith(".json"))

            for file in files:
                # "2010-2022_Chemistry_MCQs.json" -> "Chemistry MCQs": drop the
                # 10-character "YYYY-YYYY_" prefix and the ".json" suffix.
                subject = " ".join(file[10:-5].split("_"))
                dataset["test"][subject] = {"data": []}

                # The handle is only needed while parsing.
                with open(os.path.join(category_dir, file), encoding="utf-8") as f:
                    data = json.load(f)

                # It's been tested that each data sample in one subcategory has the same inference arguments.
                inference_kwargs = deepcopy(default_inference_kwargs)
                if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
                    # The option letters are derived from the first question only.
                    inference_kwargs["all_classes"] = get_all_classes(data["example"][0]["question"])
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                dataset["test"][subject]["inference_kwargs"] = inference_kwargs

                for sample in data["example"]:
                    answer = sample["answer"]

                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because if target is a list, it should be only used for multiple target answers.
                    if subject in multi_choice_datasets:
                        answer = "".join(answer)

                    # A one-element answer list is unwrapped to its only item.
                    if isinstance(answer, list) and len(answer) == 1:
                        answer = answer[0]

                    dataset["test"][subject]["data"].append(
                        {
                            "dataset": "gaokaobench",
                            "split": "test",
                            # category[:-10] strips the trailing "_Questions".
                            "category": f"{category[:-10]}-{subject}",
                            "instruction": sample["question"].strip() + "\n答案:",
                            "input": "",
                            "output": "",
                            "target": answer,
                        }
                    )

        return dataset