ColossalAI/applications/ColossalEval/colossal_eval/dataset/mmlu.py

import copy
import csv
import os
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

# Baseline inference settings shared by every MMLU subject.
# MMLUDataset.load deep-copies this dict into each subject's "inference_kwargs",
# so per-subject additions (e.g. "few_shot_data") do not leak into this default.
# NOTE(review): these keys are consumed outside this file; confirm their exact
# semantics against the inference code before changing any value.
default_inference_kwargs = {
    "calculate_loss": True,
    # Same four letters that MMLUDataset.load uses to label the answer choices.
    "all_classes": ["A", "B", "C", "D"],
    "language": "English",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_few_shot_data(data: List[Dict]):
    """
    Build few-shot demonstration strings from converted samples.

    Args:
        data: Samples that each provide an "input" string and a "target" string.

    Returns:
        One string per sample, in the same order as ``data``, made by appending
        the sample's target directly to its input.
    """
    return [sample["input"] + sample["target"] for sample in data]


class MMLUDataset(BaseDataset):
    """
    Dataset class for MMLU dataset.
    Data source: https://github.com/hendrycks/test
    This dataset class will convert the original dataset into the inference dataset.
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> Dict[str, Dict[str, Dict]]:
        """
        Read the "dev" and "test" splits of MMLU and convert them to inference samples.

        ``path`` must contain a ``dev`` and a ``test`` directory whose files are named
        ``<subject>_<split>.csv``. Every CSV row must have six columns: the question,
        the four choices (A-D) and the answer. Files that do not follow the naming
        pattern are skipped.

        Args:
            path: Root directory of the dataset.
            logger: Not used by this loader; accepted to keep the signature unchanged.
            few_shot: When true, each test subject's "inference_kwargs" gets a
                "few_shot_data" list built from the dev samples of the same subject.

        Returns:
            ``{split: {subject: {"data": [sample, ...], "inference_kwargs": {...}}}}``
            for the splits "dev" and "test".

        Raises:
            ValueError: If a CSV row does not have exactly six columns.
            KeyError: If ``few_shot`` is true and a test subject has no dev file.
        """
        dataset = {"dev": {}, "test": {}}
        # "dev" has to be processed first: the few-shot examples of a test subject
        # are built from the dev samples collected in the earlier iteration.
        for split in ["dev", "test"]:
            suffix = f"_{split}.csv"
            files = sorted(os.listdir(os.path.join(path, split)))

            for file in files:
                # Ignore stray entries (hidden files, READMEs, ...) instead of deriving
                # a bogus subject name from them and parsing them as CSV.
                if not file.endswith(suffix):
                    continue

                # "high_school_us_history_test.csv" -> "High School US History"
                subject = file[: -len(suffix)].split("_")
                subject = " ".join([word.title() if word != "us" else "US" for word in subject])

                file_dir = os.path.join(path, split, file)

                # It's been tested that all data samples in one subcategory share the same
                # inference arguments, so they are stored once per subject. The deep copy
                # keeps the module-level defaults untouched.
                subject_entry = {
                    "data": [],
                    "inference_kwargs": copy.deepcopy(default_inference_kwargs),
                }
                dataset[split][subject] = subject_entry

                if split == "test" and few_shot:
                    subject_entry["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
                        dataset["dev"][subject]["data"]
                    )

                with open(file_dir, encoding="utf-8") as f:
                    reader = csv.reader(f)
                    for row_number, row in enumerate(reader, start=1):
                        # Explicit check rather than `assert`, which is removed under `python -O`.
                        if len(row) != 6:
                            raise ValueError(
                                f"{file_dir}: row {row_number} has {len(row)} columns, expected 6 "
                                "(question, four choices, answer)."
                            )
                        choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
                        data_sample = {
                            "dataset": "mmlu",
                            "split": split,
                            "category": subject,
                            "instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
                            "input": f"Question: {row[0]}\n{choices}\nAnswer: ",
                            "output": "",
                            "target": row[5],
                        }

                        subject_entry["data"].append(data_sample)

        return dataset
[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786) * Add ColossalEval * Delete evaluate in Chat --------- Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com> Co-authored-by: Tong Li <tong.li352711588@gmail.com> 2023-09-24 15:14:11 +00:00			`import copy`
			`import csv`
			`import os`
			`from typing import Dict, List`

			`from colossalai.logging import DistributedLogger`

			`from .base import BaseDataset`

			`default_inference_kwargs = {`
			`"calculate_loss": True,`
			`"all_classes": ["A", "B", "C", "D"],`
			`"language": "English",`
			`"pretrain": False,`
			`"max_new_tokens": 32,`
			`}`


			`def get_few_shot_data(data: List[Dict]):`
			`few_shot_data = []`
			`for i in data:`
			`few_shot_data.append(i["input"] + i["target"])`
			`return few_shot_data`


			`class MMLUDataset(BaseDataset):`
			`"""`
			`Dataset class for MMLU dataset.`
			`Data source: https://github.com/hendrycks/test`
			`This dataset class will convert the original dataset into the inference dataset.`
			`"""`

			`@staticmethod`
			`def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:`
			`dataset = {"dev": {}, "test": {}}`
			`for split in ["dev", "test"]:`
			`files = os.listdir(os.path.join(path, split))`
			`files.sort()`

			`for file in files:`
			`subject = file[0 : -len(f"_{split}.csv")].split("_")`
			`subject = " ".join([word.title() if word != "us" else "US" for word in subject])`

			`file_dir = os.path.join(path, split, file)`

			`dataset[split][subject] = {"data": [], "inference_kwargs": {}}`

			`# It's been tested that each data sample in one subcategory have same inference arguments.`
			`dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)`

			`if split == "test" and few_shot:`
			`dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(`
			`dataset["dev"][subject]["data"]`
			`)`

			`with open(file_dir, encoding="utf-8") as f:`
			`reader = csv.reader(f)`
			`for row in reader:`
			`assert len(row) == 6`
			`choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"`
			`data_sample = {`
			`"dataset": "mmlu",`
			`"split": split,`
			`"category": subject,`
			`"instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",`
			`"input": f"Question: {row[0]}\n{choices}\nAnswer: ",`
			`"output": "",`
			`"target": row[5],`
			`}`

			`dataset[split][subject]["data"].append(data_sample)`

			`return dataset`