InternLM/internlm/data/dataset.py


import os
from typing import Dict

from torch.utils.data import ConcatDataset

from internlm.data.single_dataset import JsonlDataset


def get_dataset_dict(folder, split="valid") -> Dict:
"""
Return a dictionary of Datasets from a folder containing data files for validation.
Args:
folder (str): The path to the folder containing data files.
split (str): The split of the data files to be used, default is "valid".
Returns:
A dictionary containing Datasets for each folder in the given path
that contains data files with the specified split.
Raises:
AssertionError: If the given folder does not exist.
Example:
If the given folder is as follows,
- data
- zhihu
- xxx.bin
- valid.bin
- baike
- xxx.bin
- valid.bin
The returned dictionary will be,
{
'zhihu': Dataset,
'baike': Dataset
}
"""
    assert os.path.exists(folder), f"folder `{folder}` does not exist"
    data_dict = {}

    for root, dirs, files in os.walk(folder, followlinks=True):
        dirs.sort()  # Sort in place so the traversal order is deterministic; newly added folders starting with "z" are visited last
        datasets = []
        for fn in sorted(files):  # Sort so that files are concatenated in a consistent order
            if fn.endswith(".bin") and split in fn:
                fp = os.path.join(root, fn)
                ds = JsonlDataset(fp)
                datasets.append(ds)
        if datasets:
            ds = ConcatDataset(datasets=datasets)
            data_dict[os.path.basename(root)] = ds

    return data_dict
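

# A minimal usage sketch (not part of the original module): it assumes a local
# `data/` directory laid out as in the docstring example above, with per-source
# subfolders that each contain a `valid.bin` file. Each value in the returned
# dict is a ConcatDataset covering all matching `.bin` files in that subfolder.
if __name__ == "__main__":
    val_datasets = get_dataset_dict("data", split="valid")
    for name, dataset in val_datasets.items():
        print(f"{name}: {len(dataset)} samples")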